Update regex engine to latest (2.4.61)

This commit is contained in:
Kovid Goyal 2015-05-25 09:55:10 +05:30
parent 2484ed7b61
commit 2068e52b82
4 changed files with 743 additions and 314 deletions

View File

@ -1,4 +1,4 @@
This regex engine is taken, with thanks, from: https://code.google.com/p/mrab-regex-hg/ This regex engine is taken, with thanks, from: https://bitbucket.org/mrabarnett/mrab-regex
It is licensed under the Python Software Foundation License It is licensed under the Python Software Foundation License

View File

@ -225,7 +225,7 @@ __all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match",
"V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W", "WORD", "error", "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W", "WORD", "error",
"Regex"] "Regex"]
__version__ = "2.4.48" __version__ = "2.4.61"
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# Public interface. # Public interface.
@ -333,6 +333,7 @@ def compile(pattern, flags=0, **kwargs):
def purge(): def purge():
"Clear the regular expression cache" "Clear the regular expression cache"
_cache.clear() _cache.clear()
_locale_sensitive.clear()
def template(pattern, flags=0): def template(pattern, flags=0):
"Compile a template pattern, returning a pattern object." "Compile a template pattern, returning a pattern object."
@ -423,38 +424,43 @@ _MAXREPCACHE = 500
def _compile(pattern, flags=0, kwargs={}): def _compile(pattern, flags=0, kwargs={}):
"Compiles a regular expression to a PatternObject." "Compiles a regular expression to a PatternObject."
# We won't bother to cache the pattern if we're debugging.
debugging = (flags & DEBUG) != 0
# What locale is this pattern using? # What locale is this pattern using?
locale_key = (type(pattern), pattern) locale_key = (type(pattern), pattern)
if _locale_sensitive.get(locale_key, True) or (flags & LOCALE) != 0: if _locale_sensitive.get(locale_key, True) or (flags & LOCALE) != 0:
# This pattern is, or might be, locale-sensitive. # This pattern is, or might be, locale-sensitive.
pattern_locale = _getlocale() pattern_locale = _getlocale()[1]
else: else:
# This pattern is definitely not locale-sensitive. # This pattern is definitely not locale-sensitive.
pattern_locale = None pattern_locale = None
try: if not debugging:
# Do we know what keyword arguments are needed? try:
args_key = pattern, type(pattern), flags # Do we know what keyword arguments are needed?
args_needed = _named_args[args_key] args_key = pattern, type(pattern), flags
args_needed = _named_args[args_key]
# Are we being provided with its required keyword arguments? # Are we being provided with its required keyword arguments?
args_supplied = set() args_supplied = set()
if args_needed: if args_needed:
for k, v in args_needed: for k, v in args_needed:
try: try:
args_supplied.add((k, frozenset(kwargs[k]))) args_supplied.add((k, frozenset(kwargs[k])))
except KeyError: except KeyError:
raise error("missing named list: {!r}".format(k)) raise error("missing named list: {!r}".format(k))
args_supplied = frozenset(args_supplied) args_supplied = frozenset(args_supplied)
# Have we already seen this regular expression and named list? # Have we already seen this regular expression and named list?
pattern_key = (pattern, type(pattern), flags, args_supplied, pattern_key = (pattern, type(pattern), flags, args_supplied,
DEFAULT_VERSION, pattern_locale) DEFAULT_VERSION, pattern_locale)
return _cache[pattern_key] return _cache[pattern_key]
except KeyError: except KeyError:
# It's a new pattern, or new named list for a known pattern. # It's a new pattern, or new named list for a known pattern.
pass pass
# Guess the encoding from the class of the pattern string. # Guess the encoding from the class of the pattern string.
if isinstance(pattern, unicode): if isinstance(pattern, unicode):
@ -463,7 +469,7 @@ def _compile(pattern, flags=0, kwargs={}):
guess_encoding = ASCII guess_encoding = ASCII
elif isinstance(pattern, _pattern_type): elif isinstance(pattern, _pattern_type):
if flags: if flags:
raise ValueError("can't process flags argument with a compiled pattern") raise ValueError("cannot process flags argument with a compiled pattern")
return pattern return pattern
else: else:
@ -490,10 +496,11 @@ def _compile(pattern, flags=0, kwargs={}):
caught_exception = e caught_exception = e
if caught_exception: if caught_exception:
raise error(str(caught_exception)) raise error(caught_exception.msg, caught_exception.pattern,
caught_exception.pos)
if not source.at_end(): if not source.at_end():
raise error("trailing characters in pattern at position %d" % source.pos) raise error("unbalanced parenthesis", pattern, source.pos)
# Check the global flags for conflicts. # Check the global flags for conflicts.
version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
@ -520,7 +527,7 @@ def _compile(pattern, flags=0, kwargs={}):
parsed.dump(indent=0, reverse=reverse) parsed.dump(indent=0, reverse=reverse)
# Fix the group references. # Fix the group references.
parsed.fix_groups(reverse, False) parsed.fix_groups(pattern, reverse, False)
# Optimise the parsed pattern. # Optimise the parsed pattern.
parsed = parsed.optimise(info) parsed = parsed.optimise(info)
@ -591,19 +598,23 @@ def _compile(pattern, flags=0, kwargs={}):
if len(_cache) >= _MAXCACHE: if len(_cache) >= _MAXCACHE:
_cache_lock.acquire() _cache_lock.acquire()
try: try:
_shrink_cache(_cache, _named_args, _MAXCACHE) _shrink_cache(_cache, _named_args, _locale_sensitive, _MAXCACHE)
finally: finally:
_cache_lock.release() _cache_lock.release()
args_needed = frozenset(args_needed) if not debugging:
if (info.flags & LOCALE) == 0:
pattern_locale = None
# Store this regular expression and named list. args_needed = frozenset(args_needed)
pattern_key = (pattern, type(pattern), flags, args_needed, DEFAULT_VERSION,
pattern_locale)
_cache[pattern_key] = compiled_pattern
# Store what keyword arguments are needed. # Store this regular expression and named list.
_named_args[args_key] = args_needed pattern_key = (pattern, type(pattern), flags, args_needed,
DEFAULT_VERSION, pattern_locale)
_cache[pattern_key] = compiled_pattern
# Store what keyword arguments are needed.
_named_args[args_key] = args_needed
return compiled_pattern return compiled_pattern

File diff suppressed because it is too large Load Diff

View File

@ -31,9 +31,21 @@ __all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
# The regex exception. # The regex exception.
class error(Exception): class error(Exception):
def __init__(self, message, set_error=False): def __init__(self, message, pattern=None, pos=None):
newline = u'\n' if isinstance(pattern, unicode) else '\n'
self.msg = message
self.pattern = pattern
self.pos = pos
if pattern is not None and pos is not None:
self.lineno = pattern.count(newline, 0, pos) + 1
self.colno = pos - pattern.rfind(newline, 0, pos)
message = "%s at position %d" % (message, pos)
if newline in pattern:
message += " (line %d, column %d)" % (self.lineno, self.colno)
Exception.__init__(self, message) Exception.__init__(self, message)
self.set_error = set_error
# The exception for when a positional flag has been turned on in the old # The exception for when a positional flag has been turned on in the old
# behaviour. # behaviour.
@ -210,7 +222,7 @@ OP = Namespace()
for i, op in enumerate(OPCODES.split()): for i, op in enumerate(OPCODES.split()):
setattr(OP, op, i) setattr(OP, op, i)
def _shrink_cache(cache_dict, args_dict, max_length, divisor=5): def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
"""Make room in the given cache. """Make room in the given cache.
Args: Args:
@ -247,10 +259,18 @@ def _shrink_cache(cache_dict, args_dict, max_length, divisor=5):
# Ignore problems if the cache changed from another thread. # Ignore problems if the cache changed from another thread.
pass pass
# Rebuild the arguments dictionary. # Rebuild the arguments and locale-sensitivity dictionaries.
args_dict.clear() args_dict.clear()
sensitivity_dict = {}
for pattern, pattern_type, flags, args, default_version, locale in cache_dict: for pattern, pattern_type, flags, args, default_version, locale in cache_dict:
args_dict[pattern, pattern_type, flags, default_version, locale] = args args_dict[pattern, pattern_type, flags, default_version, locale] = args
try:
sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
except KeyError:
pass
locale_sensitive.clear()
locale_sensitive.update(sensitivity_dict)
def _fold_case(info, string): def _fold_case(info, string):
"Folds the case of a string." "Folds the case of a string."
@ -384,8 +404,11 @@ def apply_quantifier(source, info, counts, characters, case_flags, ch,
element = Character(characters[-1], case_flags=case_flags) element = Character(characters[-1], case_flags=case_flags)
else: else:
# The quantifier applies to the last item in the sequence. # The quantifier applies to the last item in the sequence.
if applied or not sequence: if applied:
raise error("nothing to repeat at position %d" % saved_pos) raise error("multiple repeat", source.string, saved_pos)
if not sequence:
raise error("nothing to repeat", source.string, saved_pos)
element = sequence.pop() element = sequence.pop()
@ -420,7 +443,8 @@ def apply_constraint(source, info, constraints, characters, case_flags,
else: else:
# The constraint applies to the last item in the sequence. # The constraint applies to the last item in the sequence.
if applied or not sequence: if applied or not sequence:
raise error("nothing for fuzzy constraint at position %d" % saved_pos) raise error("nothing for fuzzy constraint", source.string,
saved_pos)
element = sequence.pop() element = sequence.pop()
@ -473,7 +497,8 @@ def parse_limited_quantifier(source):
max_count = int(max_count) if max_count else None max_count = int(max_count) if max_count else None
if max_count is not None and min_count > max_count: if max_count is not None and min_count > max_count:
raise error("min repeat greater than max repeat at position %d" % saved_pos) raise error("min repeat greater than max repeat", source.string,
saved_pos)
else: else:
if not min_count: if not min_count:
source.pos = saved_pos source.pos = saved_pos
@ -482,7 +507,7 @@ def parse_limited_quantifier(source):
min_count = max_count = int(min_count) min_count = max_count = int(min_count)
if is_above_limit(min_count) or is_above_limit(max_count): if is_above_limit(min_count) or is_above_limit(max_count):
raise error("repeat count too big at position %d" % saved_pos) raise error("repeat count too big", source.string, saved_pos)
if not source.match ("}"): if not source.match ("}"):
source.pos = saved_pos source.pos = saved_pos
@ -507,7 +532,7 @@ def parse_fuzzy(source, ch):
return None return None
if not source.match("}"): if not source.match("}"):
raise error("expected } at position %d" % source.pos) raise error("expected }", source.string, source.pos)
return constraints return constraints
@ -544,7 +569,7 @@ def parse_cost_constraint(source, constraints):
max_cost -= 1 max_cost -= 1
if max_cost < 0: if max_cost < 0:
raise error("bad fuzzy cost limit at position %d" % cost_pos) raise error("bad fuzzy cost limit", source.string, cost_pos)
constraints[constraint] = 0, max_cost constraints[constraint] = 0, max_cost
elif ch in DIGITS: elif ch in DIGITS:
@ -575,7 +600,7 @@ def parse_cost_constraint(source, constraints):
max_cost -= 1 max_cost -= 1
if not 0 <= min_cost <= max_cost: if not 0 <= min_cost <= max_cost:
raise error("bad fuzzy cost limit at position %d" % cost_pos) raise error("bad fuzzy cost limit", source.string, cost_pos)
constraints[constraint] = min_cost, max_cost constraints[constraint] = min_cost, max_cost
except ValueError: except ValueError:
@ -586,10 +611,10 @@ def parse_cost_constraint(source, constraints):
def parse_constraint(source, constraints, ch): def parse_constraint(source, constraints, ch):
"Parses a constraint." "Parses a constraint."
if ch not in "deis": if ch not in "deis":
raise error("bad fuzzy constraint at position %d" % source.pos) raise error("bad fuzzy constraint", source.string, source.pos)
if ch in constraints: if ch in constraints:
raise error("repeated fuzzy constraint at position %d" % source.pos) raise error("repeated fuzzy constraint", source.string, source.pos)
return ch return ch
@ -605,7 +630,7 @@ def parse_fuzzy_compare(source):
def parse_cost_equation(source, constraints): def parse_cost_equation(source, constraints):
"Parses a cost equation." "Parses a cost equation."
if "cost" in constraints: if "cost" in constraints:
raise error("more than one cost equation at position %d" % source.pos) raise error("more than one cost equation", source.string, source.pos)
cost = {} cost = {}
@ -615,7 +640,7 @@ def parse_cost_equation(source, constraints):
max_inc = parse_fuzzy_compare(source) max_inc = parse_fuzzy_compare(source)
if max_inc is None: if max_inc is None:
raise error("missing fuzzy cost limit at position %d" % source.pos) raise error("missing fuzzy cost limit", source.string, source.pos)
max_cost = int(parse_count(source)) max_cost = int(parse_count(source))
@ -623,7 +648,7 @@ def parse_cost_equation(source, constraints):
max_cost -= 1 max_cost -= 1
if max_cost < 0: if max_cost < 0:
raise error("bad fuzzy cost limit at position %d" % source.pos) raise error("bad fuzzy cost limit", source.string, source.pos)
cost["max"] = max_cost cost["max"] = max_cost
@ -637,7 +662,7 @@ def parse_cost_term(source, cost):
raise ParseError() raise ParseError()
if ch in cost: if ch in cost:
raise error("repeated fuzzy cost at position %d" % source.pos) raise error("repeated fuzzy cost", source.string, source.pos)
cost[ch] = int(coeff or 1) cost[ch] = int(coeff or 1)
@ -816,10 +841,11 @@ def parse_extension(source, info):
return Group(info, group, subpattern) return Group(info, group, subpattern)
if ch == "=": if ch == "=":
# (?P=...: a named group reference. # (?P=...: a named group reference.
name = parse_name(source) name = parse_name(source, allow_numeric=True)
source.expect(")") source.expect(")")
if info.is_open_group(name): if info.is_open_group(name):
raise error("can't refer to an open group at position %d" % saved_pos) raise error("cannot refer to an open group", source.string,
saved_pos)
return make_ref_group(info, name, saved_pos) return make_ref_group(info, name, saved_pos)
if ch == ">" or ch == "&": if ch == ">" or ch == "&":
@ -827,7 +853,7 @@ def parse_extension(source, info):
return parse_call_named_group(source, info, saved_pos) return parse_call_named_group(source, info, saved_pos)
source.pos = saved_pos source.pos = saved_pos
raise error("unknown extension at position %d" % saved_pos) raise error("unknown extension", source.string, saved_pos)
def parse_comment(source): def parse_comment(source):
"Parses a comment." "Parses a comment."
@ -941,7 +967,8 @@ def parse_flags(source, info):
if source.match("-"): if source.match("-"):
flags_off = parse_flag_set(source) flags_off = parse_flag_set(source)
if not flags_off: if not flags_off:
raise error("bad inline flags: no flags after '-' at position %d" % source.pos) raise error("bad inline flags: no flags after '-'", source.string,
source.pos)
else: else:
flags_off = 0 flags_off = 0
@ -973,10 +1000,12 @@ def parse_flags_subpattern(source, info):
flags_on, flags_off = parse_flags(source, info) flags_on, flags_off = parse_flags(source, info)
if flags_off & GLOBAL_FLAGS: if flags_off & GLOBAL_FLAGS:
raise error("bad inline flags: can't turn off global flag at position %d" % source.pos) raise error("bad inline flags: cannot turn off global flag",
source.string, source.pos)
if flags_on & flags_off: if flags_on & flags_off:
raise error("bad inline flags: flag turned on and off at position %d" % source.pos) raise error("bad inline flags: flag turned on and off", source.string,
source.pos)
# Handle flags which are global in all regex behaviours. # Handle flags which are global in all regex behaviours.
new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
@ -996,7 +1025,7 @@ def parse_flags_subpattern(source, info):
parse_positional_flags(source, info, flags_on, flags_off) parse_positional_flags(source, info, flags_on, flags_off)
return FLAGS return FLAGS
raise error("unknown extension at position %d" % source.pos) raise error("unknown extension", source.string, source.pos)
def parse_positional_flags(source, info, flags_on, flags_off): def parse_positional_flags(source, info, flags_on, flags_off):
"Parses positional flags." "Parses positional flags."
@ -1004,7 +1033,8 @@ def parse_positional_flags(source, info, flags_on, flags_off):
if version == VERSION0: if version == VERSION0:
# Positional flags are global and can only be turned on. # Positional flags are global and can only be turned on.
if flags_off: if flags_off:
raise error("bad inline flags: can't turn flags off at position %d" % source.pos) raise error("bad inline flags: cannot turn flags off",
source.string, source.pos)
new_global_flags = flags_on & ~info.global_flags new_global_flags = flags_on & ~info.global_flags
if new_global_flags: if new_global_flags:
@ -1017,19 +1047,22 @@ def parse_positional_flags(source, info, flags_on, flags_off):
source.ignore_space = bool(info.flags & VERBOSE) source.ignore_space = bool(info.flags & VERBOSE)
def parse_name(source, allow_numeric=False): def parse_name(source, allow_numeric=False, allow_group_0=False):
"Parses a name." "Parses a name."
name = source.get_while(set(")>"), include=False) name = source.get_while(set(")>"), include=False)
if not name: if not name:
raise error("bad group name at position %d" % source.pos) raise error("missing group name", source.string, source.pos)
if name.isdigit(): if name.isdigit():
if not allow_numeric: min_group = 0 if allow_group_0 else 1
raise error("bad group name at position %d" % source.pos) if not allow_numeric or int(name) < min_group:
raise error("bad character in group name", source.string,
source.pos)
else: else:
if not is_identifier(name): if not is_identifier(name):
raise error("bad group name at position %d" % source.pos) raise error("bad character in group name", source.string,
source.pos)
return name return name
@ -1064,10 +1097,10 @@ def parse_escape(source, info, in_set):
source.ignore_space = saved_ignore source.ignore_space = saved_ignore
if not ch: if not ch:
# A backslash at the end of the pattern. # A backslash at the end of the pattern.
raise error("bad escape at position %d" % source.pos) raise error("bad escape (end of pattern)", source.string, source.pos)
if ch in HEX_ESCAPES: if ch in HEX_ESCAPES:
# A hexadecimal escape sequence. # A hexadecimal escape sequence.
return parse_hex_escape(source, info, HEX_ESCAPES[ch], in_set) return parse_hex_escape(source, info, HEX_ESCAPES[ch], in_set, ch)
elif ch == "g" and not in_set: elif ch == "g" and not in_set:
# A group reference. # A group reference.
saved_pos = source.pos saved_pos = source.pos
@ -1150,7 +1183,7 @@ def parse_numeric_escape(source, info, ch, in_set):
# Group reference. # Group reference.
source.pos = saved_pos source.pos = saved_pos
if info.is_open_group(digits): if info.is_open_group(digits):
raise error("can't refer to an open group at position %d" % source.pos) raise error("cannot refer to an open group", source.string, source.pos)
return make_ref_group(info, digits, source.pos) return make_ref_group(info, digits, source.pos)
@ -1168,15 +1201,21 @@ def parse_octal_escape(source, info, digits, in_set):
value = int("".join(digits), 8) value = int("".join(digits), 8)
return make_character(info, value, in_set) return make_character(info, value, in_set)
except ValueError: except ValueError:
raise error("bad octal escape at position %d" % source.pos) if digits[0] in OCT_DIGITS:
raise error("incomplete escape \\%s" % ''.join(digits),
source.string, source.pos)
else:
raise error("bad escape \\%s" % digits[0], source.string,
source.pos)
def parse_hex_escape(source, info, expected_len, in_set): def parse_hex_escape(source, info, expected_len, in_set, type):
"Parses a hex escape sequence." "Parses a hex escape sequence."
digits = [] digits = []
for i in range(expected_len): for i in range(expected_len):
ch = source.get() ch = source.get()
if ch not in HEX_DIGITS: if ch not in HEX_DIGITS:
raise error("bad hex escape at position %d" % source.pos) raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
source.string, source.pos)
digits.append(ch) digits.append(ch)
value = int("".join(digits), 16) value = int("".join(digits), 16)
@ -1189,7 +1228,7 @@ def parse_group_ref(source, info):
name = parse_name(source, True) name = parse_name(source, True)
source.expect(">") source.expect(">")
if info.is_open_group(name): if info.is_open_group(name):
raise error("can't refer to an open group at position %d" % source.pos) raise error("cannot refer to an open group", source.string, source.pos)
return make_ref_group(info, name, saved_pos) return make_ref_group(info, name, saved_pos)
@ -1199,7 +1238,7 @@ def parse_string_set(source, info):
name = parse_name(source, True) name = parse_name(source, True)
source.expect(">") source.expect(">")
if name is None or name not in info.kwargs: if name is None or name not in info.kwargs:
raise error("undefined named list at position %d" % source.pos) raise error("undefined named list", source.string, source.pos)
return make_string_set(info, name) return make_string_set(info, name)
@ -1213,7 +1252,8 @@ def parse_named_char(source, info, in_set):
value = unicodedata.lookup(name) value = unicodedata.lookup(name)
return make_character(info, ord(value), in_set) return make_character(info, ord(value), in_set)
except KeyError: except KeyError:
raise error("undefined character name at position %d" % source.pos) raise error("undefined character name", source.string,
source.pos)
source.pos = saved_pos source.pos = saved_pos
return make_character(info, ord("N"), in_set) return make_character(info, ord("N"), in_set)
@ -1227,12 +1267,12 @@ def parse_property(source, info, positive, in_set):
prop_name, name = parse_property_name(source) prop_name, name = parse_property_name(source)
if source.match("}"): if source.match("}"):
# It's correctly delimited. # It's correctly delimited.
prop = lookup_property(prop_name, name, positive != negate, source_pos=source.pos) prop = lookup_property(prop_name, name, positive != negate, source)
return make_property(info, prop, in_set) return make_property(info, prop, in_set)
elif ch and ch in "CLMNPSZ": elif ch and ch in "CLMNPSZ":
# An abbreviated property, eg \pL. # An abbreviated property, eg \pL.
prop = lookup_property(None, ch, positive) prop = lookup_property(None, ch, positive, source)
return make_property(info, prop, in_set, source_pos=source.pos) return make_property(info, prop, in_set)
# Not a property, so treat as a literal "p" or "P". # Not a property, so treat as a literal "p" or "P".
source.pos = saved_pos source.pos = saved_pos
@ -1276,7 +1316,7 @@ def parse_set(source, info):
item = parse_set_union(source, info) item = parse_set_union(source, info)
if not source.match("]"): if not source.match("]"):
raise error("missing ] at position %d" % source.pos) raise error("missing ]", source.string, source.pos)
finally: finally:
source.ignore_space = saved_ignore source.ignore_space = saved_ignore
@ -1354,17 +1394,26 @@ def parse_set_member(source, info):
"Parses a member in a character set." "Parses a member in a character set."
# Parse a set item. # Parse a set item.
start = parse_set_item(source, info) start = parse_set_item(source, info)
saved_pos1 = source.pos
if (not isinstance(start, Character) or not start.positive or not if (not isinstance(start, Character) or not start.positive or not
source.match("-")): source.match("-")):
# It's not the start of a range. # It's not the start of a range.
return start return start
version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
# It looks like the start of a range of characters. # It looks like the start of a range of characters.
saved_pos = source.pos saved_pos2 = source.pos
if version == VERSION1 and source.match("-"):
# It's actually the set difference operator '--', so return the
# character.
source.pos = saved_pos1
return start
if source.match("]"): if source.match("]"):
# We've reached the end of the set, so return both the character and # We've reached the end of the set, so return both the character and
# hyphen. # hyphen.
source.pos = saved_pos source.pos = saved_pos2
return SetUnion(info, [start, Character(ord("-"))]) return SetUnion(info, [start, Character(ord("-"))])
# Parse a set item. # Parse a set item.
@ -1375,7 +1424,7 @@ def parse_set_member(source, info):
# It _is_ a range. # It _is_ a range.
if start.value > end.value: if start.value > end.value:
raise error("bad character range at position %d" % source.pos) raise error("bad character range", source.string, source.pos)
if start.value == end.value: if start.value == end.value:
return start return start
@ -1407,7 +1456,7 @@ def parse_set_item(source, info):
item = parse_set_union(source, info) item = parse_set_union(source, info)
if not source.match("]"): if not source.match("]"):
raise error("missing ] at position %d" % source.pos) raise error("missing ]", source.string, source.pos)
if negate: if negate:
item = item.with_flags(positive=not item.positive) item = item.with_flags(positive=not item.positive)
@ -1416,7 +1465,7 @@ def parse_set_item(source, info):
ch = source.get() ch = source.get()
if not ch: if not ch:
raise error("bad set at position %d" % source.pos, True) raise error("unterminated character set", source.string, source.pos)
return Character(ord(ch)) return Character(ord(ch))
@ -1427,7 +1476,7 @@ def parse_posix_class(source, info):
if not source.match(":]"): if not source.match(":]"):
raise ParseError() raise ParseError()
return lookup_property(prop_name, name, positive=not negate, source_pos=source.pos) return lookup_property(prop_name, name, not negate, source)
def float_to_rational(flt): def float_to_rational(flt):
"Converts a float to a rational pair." "Converts a float to a rational pair."
@ -1442,7 +1491,7 @@ def float_to_rational(flt):
def numeric_to_rational(numeric): def numeric_to_rational(numeric):
"Converts a numeric string to a rational string, if possible." "Converts a numeric string to a rational string, if possible."
if numeric[0] == "-": if numeric[ : 1] == "-":
sign, numeric = numeric[0], numeric[1 : ] sign, numeric = numeric[0], numeric[1 : ]
else: else:
sign = "" sign = ""
@ -1468,7 +1517,7 @@ def standardise_name(name):
except (ValueError, ZeroDivisionError): except (ValueError, ZeroDivisionError):
return "".join(ch for ch in name if ch not in "_- ").upper() return "".join(ch for ch in name if ch not in "_- ").upper()
def lookup_property(property, value, positive, source_pos=None): def lookup_property(property, value, positive, source=None):
"Looks up a property." "Looks up a property."
# Normalise the names (which may still be lists). # Normalise the names (which may still be lists).
property = standardise_name(property) if property else None property = standardise_name(property) if property else None
@ -1481,12 +1530,18 @@ def lookup_property(property, value, positive, source_pos=None):
# Both the property and the value are provided. # Both the property and the value are provided.
prop = PROPERTIES.get(property) prop = PROPERTIES.get(property)
if not prop: if not prop:
raise error("unknown property at position %d" % source_pos) if not source:
raise error("unknown property")
raise error("unknown property", source.string, source.pos)
prop_id, value_dict = prop prop_id, value_dict = prop
val_id = value_dict.get(value) val_id = value_dict.get(value)
if val_id is None: if val_id is None:
raise error("unknown property value at position %d" % source_pos) if not source:
raise error("unknown property value")
raise error("unknown property value", source.string, source.pos)
if "YES" in value_dict and val_id == 0: if "YES" in value_dict and val_id == 0:
positive, val_id = not positive, 1 positive, val_id = not positive, 1
@ -1526,7 +1581,10 @@ def lookup_property(property, value, positive, source_pos=None):
return Property((prop_id << 16) | val_id, positive) return Property((prop_id << 16) | val_id, positive)
# Unknown property. # Unknown property.
raise error("unknown property at position %d" % source_pos) if not source:
raise error("unknown property")
raise error("unknown property", source.string, source.pos)
def _compile_replacement(source, pattern, is_unicode): def _compile_replacement(source, pattern, is_unicode):
"Compiles a replacement template escape sequence." "Compiles a replacement template escape sequence."
@ -1539,7 +1597,7 @@ def _compile_replacement(source, pattern, is_unicode):
if ch in HEX_ESCAPES and (ch == "x" or is_unicode): if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
# A hexadecimal escape sequence. # A hexadecimal escape sequence.
return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch])] return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]
if ch == "g": if ch == "g":
# A group preference. # A group preference.
@ -1595,18 +1653,19 @@ def _compile_replacement(source, pattern, is_unicode):
if not ch: if not ch:
# A trailing backslash. # A trailing backslash.
raise error("bad escape at position %d" % source.pos) raise error("bad escape (end of pattern)", source.string, source.pos)
# An escaped non-backslash is a backslash followed by the literal. # An escaped non-backslash is a backslash followed by the literal.
return False, [ord("\\"), ord(ch)] return False, [ord("\\"), ord(ch)]
def parse_repl_hex_escape(source, expected_len): def parse_repl_hex_escape(source, expected_len, type):
"Parses a hex escape sequence in a replacement string." "Parses a hex escape sequence in a replacement string."
digits = [] digits = []
for i in range(expected_len): for i in range(expected_len):
ch = source.get() ch = source.get()
if ch not in HEX_DIGITS: if ch not in HEX_DIGITS:
raise error("bad hex escape at position %d" % source.pos) raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
source.string, source.pos)
digits.append(ch) digits.append(ch)
return int("".join(digits), 16) return int("".join(digits), 16)
@ -1622,7 +1681,8 @@ def parse_repl_named_char(source):
value = unicodedata.lookup(name) value = unicodedata.lookup(name)
return ord(value) return ord(value)
except KeyError: except KeyError:
raise error("undefined character name at position %d" % source.pos) raise error("undefined character name", source.string,
source.pos)
source.pos = saved_pos source.pos = saved_pos
return None return None
@ -1630,13 +1690,13 @@ def parse_repl_named_char(source):
def compile_repl_group(source, pattern): def compile_repl_group(source, pattern):
"Compiles a replacement template group reference." "Compiles a replacement template group reference."
source.expect("<") source.expect("<")
name = parse_name(source, True) name = parse_name(source, True, True)
source.expect(">") source.expect(">")
if name.isdigit(): if name.isdigit():
index = int(name) index = int(name)
if not 0 <= index <= pattern.groups: if not 0 <= index <= pattern.groups:
raise error("invalid group at position %d" % source.pos) raise error("invalid group reference", source.string, source.pos)
return index return index
@ -1689,7 +1749,7 @@ class RegexBase(object):
return self.rebuild(positive, case_flags, zerowidth) return self.rebuild(positive, case_flags, zerowidth)
def fix_groups(self, reverse, fuzzy): def fix_groups(self, pattern, reverse, fuzzy):
pass pass
def optimise(self, info): def optimise(self, info):
@ -1797,8 +1857,8 @@ class Atomic(RegexBase):
RegexBase.__init__(self) RegexBase.__init__(self)
self.subpattern = subpattern self.subpattern = subpattern
def fix_groups(self, pattern, reverse, fuzzy):
    """Forward the group-fixing pass, unchanged, to the wrapped subpattern."""
    self.subpattern.fix_groups(pattern, reverse, fuzzy)
def optimise(self, info): def optimise(self, info):
self.subpattern = self.subpattern.optimise(info) self.subpattern = self.subpattern.optimise(info)
@ -1857,9 +1917,9 @@ class Branch(RegexBase):
RegexBase.__init__(self) RegexBase.__init__(self)
self.branches = branches self.branches = branches
def fix_groups(self, pattern, reverse, fuzzy):
    """Forward the group-fixing pass to each entry of self.branches, in order."""
    for branch in self.branches:
        branch.fix_groups(pattern, reverse, fuzzy)
def optimise(self, info): def optimise(self, info):
# Flatten branches within branches. # Flatten branches within branches.
@ -2235,27 +2295,27 @@ class CallGroup(RegexBase):
self._key = self.__class__, self.group self._key = self.__class__, self.group
def fix_groups(self, pattern, reverse, fuzzy):
    """Resolve the called group to a number and register this call.

    self.group is first tried as an integer; if that fails it is looked
    up as a name in self.info.group_index. On success the call is
    appended to self.info.group_calls as (self, reverse, fuzzy) and
    self._key is refreshed with the resolved number.

    Raises:
        error: if the name is not in group_index, if the number is
            outside 0..info.group_count, or if the number is positive
            and info.open_group_count for it is greater than 1.
    """
    info = self.info

    try:
        self.group = int(self.group)
    except ValueError:
        # Not numeric, so treat it as a group name.
        try:
            self.group = info.group_index[self.group]
        except KeyError:
            # NOTE(review): Conditional/RefGroup report this case as
            # "unknown group" and the range failure as "invalid group
            # reference"; the wording here is the other way round.
            raise error("invalid group reference", pattern, self.position)

    # Group 0 is an accepted target here.
    if not 0 <= self.group <= info.group_count:
        raise error("unknown group", pattern, self.position)

    if self.group > 0 and info.open_group_count[self.group] > 1:
        raise error("ambiguous group reference", pattern, self.position)

    info.group_calls.append((self, reverse, fuzzy))

    self._key = self.__class__, self.group
def remove_captures(self):
    """Always raise: a group call is rejected when captures are removed.

    Raises:
        error: unconditionally, carrying self.position.
    """
    # This method receives no 'pattern' argument, so the name 'pattern'
    # is undefined here; referring to it raised NameError instead of
    # the intended error. None is passed in its place.
    # NOTE(review): assumes error() accepts pattern=None - confirm
    # against the error class definition.
    raise error("group reference not allowed", None, self.position)
def _compile(self, reverse, fuzzy):
    """Return a one-element list holding the (OP.GROUP_CALL, call_ref) tuple.

    The reverse and fuzzy arguments are not used by this node.
    """
    return [(OP.GROUP_CALL, self.call_ref)]
@ -2352,20 +2412,20 @@ class Conditional(RegexBase):
self.no_item = no_item self.no_item = no_item
self.position = position self.position = position
def fix_groups(self, pattern, reverse, fuzzy):
    """Resolve the tested group to a number, then fix both alternatives.

    self.group is first tried as an integer; if that fails it is looked
    up as a name in self.info.group_index. Afterwards the pass is
    forwarded to self.yes_item and then self.no_item.

    Raises:
        error: if the name is not in group_index, or if the number is
            outside 1..info.group_count.
    """
    info = self.info

    try:
        self.group = int(self.group)
    except ValueError:
        # Not numeric, so treat it as a group name.
        try:
            self.group = info.group_index[self.group]
        except KeyError:
            raise error("unknown group", pattern, self.position)

    # Group 0 is not accepted as the tested group.
    if not 1 <= self.group <= info.group_count:
        raise error("invalid group reference", pattern, self.position)

    self.yes_item.fix_groups(pattern, reverse, fuzzy)
    self.no_item.fix_groups(pattern, reverse, fuzzy)
def optimise(self, info): def optimise(self, info):
yes_item = self.yes_item.optimise(info) yes_item = self.yes_item.optimise(info)
@ -2496,8 +2556,8 @@ class Fuzzy(RegexBase):
constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max": constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
constraints["e"][1]} constraints["e"][1]}
def fix_groups(self, pattern, reverse, fuzzy):
    """Forward the group-fixing pass to the subpattern with fuzzy forced on.

    The incoming fuzzy argument is ignored: the subpattern always
    receives True, while pattern and reverse are passed through.
    """
    self.subpattern.fix_groups(pattern, reverse, True)
def pack_characters(self, info): def pack_characters(self, info):
self.subpattern = self.subpattern.pack_characters(info) self.subpattern = self.subpattern.pack_characters(info)
@ -2612,8 +2672,8 @@ class GreedyRepeat(RegexBase):
self.min_count = min_count self.min_count = min_count
self.max_count = max_count self.max_count = max_count
def fix_groups(self, pattern, reverse, fuzzy):
    """Forward the group-fixing pass, unchanged, to the repeated subpattern."""
    self.subpattern.fix_groups(pattern, reverse, fuzzy)
def optimise(self, info): def optimise(self, info):
subpattern = self.subpattern.optimise(info) subpattern = self.subpattern.optimise(info)
@ -2700,9 +2760,9 @@ class Group(RegexBase):
self.call_ref = None self.call_ref = None
def fix_groups(self, pattern, reverse, fuzzy):
    """Record this group in info.defined_groups, then fix its subpattern.

    The entry stored under self.group is the tuple (self, reverse,
    fuzzy); it is written before the pass is forwarded to the
    subpattern.
    """
    self.info.defined_groups[self.group] = (self, reverse, fuzzy)
    self.subpattern.fix_groups(pattern, reverse, fuzzy)
def optimise(self, info): def optimise(self, info):
subpattern = self.subpattern.optimise(info) subpattern = self.subpattern.optimise(info)
@ -2788,8 +2848,8 @@ class LookAround(RegexBase):
self.positive = bool(positive) self.positive = bool(positive)
self.subpattern = subpattern self.subpattern = subpattern
def fix_groups(self, pattern, reverse, fuzzy):
    """Forward the group-fixing pass using this node's own direction.

    The incoming reverse argument is ignored: the subpattern receives
    self.behind in its place, while pattern and fuzzy are passed
    through.
    """
    self.subpattern.fix_groups(pattern, self.behind, fuzzy)
def optimise(self, info): def optimise(self, info):
subpattern = self.subpattern.optimise(info) subpattern = self.subpattern.optimise(info)
@ -2982,22 +3042,22 @@ class RefGroup(RegexBase):
self._key = self.__class__, self.group, self.case_flags self._key = self.__class__, self.group, self.case_flags
def fix_groups(self, pattern, reverse, fuzzy):
    """Resolve the referenced group to a number and refresh self._key.

    self.group is first tried as an integer; if that fails it is looked
    up as a name in self.info.group_index. The reverse and fuzzy
    arguments are not used by this node.

    Raises:
        error: if the name is not in group_index, or if the number is
            outside 1..info.group_count.
    """
    info = self.info

    try:
        self.group = int(self.group)
    except ValueError:
        # Not numeric, so treat it as a group name.
        try:
            self.group = info.group_index[self.group]
        except KeyError:
            raise error("unknown group", pattern, self.position)

    # Group 0 is not accepted as a reference target.
    if not 1 <= self.group <= info.group_count:
        raise error("invalid group reference", pattern, self.position)

    self._key = self.__class__, self.group, self.case_flags
def remove_captures(self):
    """Always raise: a group reference is rejected when captures are removed.

    Raises:
        error: unconditionally, carrying self.position.
    """
    # This method receives no 'pattern' argument, so the name 'pattern'
    # is undefined here; referring to it raised NameError instead of
    # the intended error. None is passed in its place.
    # NOTE(review): assumes error() accepts pattern=None - confirm
    # against the error class definition.
    raise error("group reference not allowed", None, self.position)
def _compile(self, reverse, fuzzy): def _compile(self, reverse, fuzzy):
flags = 0 flags = 0
@ -3024,9 +3084,9 @@ class Sequence(RegexBase):
self.items = items self.items = items
def fix_groups(self, pattern, reverse, fuzzy):
    """Forward the group-fixing pass to each entry of self.items, in order."""
    for item in self.items:
        item.fix_groups(pattern, reverse, fuzzy)
def optimise(self, info): def optimise(self, info):
# Flatten the sequences. # Flatten the sequences.
@ -3208,7 +3268,7 @@ class SetBase(RegexBase):
print "%s%s %s%s" % (INDENT * indent, self._op_name, print "%s%s %s%s" % (INDENT * indent, self._op_name,
POS_TEXT[self.positive], CASE_TEXT[self.case_flags]) POS_TEXT[self.positive], CASE_TEXT[self.case_flags])
for i in self.items: for i in self.items:
i.dump(indent + 1) i.dump(indent + 1, reverse)
def _handle_case_folding(self, info, in_set): def _handle_case_folding(self, info, in_set):
# Is the set case-sensitive? # Is the set case-sensitive?
@ -3494,9 +3554,9 @@ class String(RegexBase):
class Literal(String):
    """String subclass that overrides only the debug dump."""

    def _dump(self, indent, reverse):
        """Print one "CHARACTER MATCH" line per entry of self.characters.

        Each entry is converted with unichr(), so entries are expected
        to be integer character codes. Lines are prefixed with INDENT
        repeated indent times and suffixed with the CASE_TEXT entry for
        self.case_flags. The reverse argument is not used.
        """
        for c in self.characters:
            # repr() of a unicode character carries a leading 'u'
            # prefix; strip it so only the quoted character remains.
            display = repr(unichr(c)).lstrip("bu")
            # A single parenthesised argument prints the same text under
            # the Python 2 print statement used elsewhere in this file.
            print("%sCHARACTER MATCH %s%s" % (INDENT * indent, display,
              CASE_TEXT[self.case_flags]))
class StringSet(RegexBase): class StringSet(RegexBase):
_opcode = {(NOCASE, False): OP.STRING_SET, (IGNORECASE, False): _opcode = {(NOCASE, False): OP.STRING_SET, (IGNORECASE, False):
@ -3792,7 +3852,7 @@ class Source(object):
def expect(self, substring):
    """Require substring at the current position.

    Delegates to self.match(substring) and returns None when that
    succeeds.

    Raises:
        error: "missing <substring>", with self.string and self.pos,
            when self.match(substring) is false.
    """
    if self.match(substring):
        return
    raise error("missing %s" % substring, self.string, self.pos)
def at_end(self): def at_end(self):
string = self.string string = self.string
@ -3953,7 +4013,7 @@ class Scanner:
source.ignore_space = bool(info.flags & VERBOSE) source.ignore_space = bool(info.flags & VERBOSE)
parsed = _parse_pattern(source, info) parsed = _parse_pattern(source, info)
if not source.at_end(): if not source.at_end():
raise error("trailing characters at position %d" % source.pos) raise error("unbalanced parenthesis", source.string, source.pos)
# We want to forbid capture groups within each phrase. # We want to forbid capture groups within each phrase.
patterns.append(parsed.remove_captures()) patterns.append(parsed.remove_captures())
@ -3977,7 +4037,8 @@ class Scanner:
# Complain if there are any group calls. They are not supported by the # Complain if there are any group calls. They are not supported by the
# Scanner class. # Scanner class.
if info.call_refs: if info.call_refs:
raise error("recursive regex not supported by Scanner") raise error("recursive regex not supported by Scanner",
source.string, source.pos)
reverse = bool(info.flags & REVERSE) reverse = bool(info.flags & REVERSE)