Update regex module from upstream

commit 4ffaba8e82
parent 2e42bfa374
Author: Kovid Goyal
Date: 2014-09-20 08:39:13 +05:30
6 changed files with 13422 additions and 8996 deletions

View File

@ -225,31 +225,31 @@ __all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match",
"V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W", "WORD", "error", "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W", "WORD", "error",
"Regex"] "Regex"]
__version__ = "2.4.39" __version__ = "2.4.48"
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# Public interface. # Public interface.
def match(pattern, string, flags=0, pos=None, endpos=None, concurrent=None, def match(pattern, string, flags=0, pos=None, endpos=None, partial=False,
**kwargs): concurrent=None, **kwargs):
"""Try to apply the pattern at the start of the string, returning a match """Try to apply the pattern at the start of the string, returning a match
object, or None if no match was found.""" object, or None if no match was found."""
return _compile(pattern, flags, kwargs).match(string, pos, endpos, return _compile(pattern, flags, kwargs).match(string, pos, endpos,
concurrent) concurrent, partial)
def fullmatch(pattern, string, flags=0, pos=None, endpos=None, concurrent=None, def fullmatch(pattern, string, flags=0, pos=None, endpos=None, partial=False,
**kwargs): concurrent=None, **kwargs):
"""Try to apply the pattern against all of the string, returning a match """Try to apply the pattern against all of the string, returning a match
object, or None if no match was found.""" object, or None if no match was found."""
return _compile(pattern, flags, kwargs).fullmatch(string, pos, endpos, return _compile(pattern, flags, kwargs).fullmatch(string, pos, endpos,
concurrent) concurrent, partial)
def search(pattern, string, flags=0, pos=None, endpos=None, concurrent=None, def search(pattern, string, flags=0, pos=None, endpos=None, partial=False,
**kwargs): concurrent=None, **kwargs):
"""Search through string looking for a match to the pattern, returning a """Search through string looking for a match to the pattern, returning a
match object, or None if no match was found.""" match object, or None if no match was found."""
return _compile(pattern, flags, kwargs).search(string, pos, endpos, return _compile(pattern, flags, kwargs).search(string, pos, endpos,
concurrent) concurrent, partial)
def sub(pattern, repl, string, count=0, flags=0, pos=None, endpos=None, def sub(pattern, repl, string, count=0, flags=0, pos=None, endpos=None,
concurrent=None, **kwargs): concurrent=None, **kwargs):
@ -319,12 +319,12 @@ def findall(pattern, string, flags=0, pos=None, endpos=None, overlapped=False,
overlapped, concurrent) overlapped, concurrent)
def finditer(pattern, string, flags=0, pos=None, endpos=None, overlapped=False, def finditer(pattern, string, flags=0, pos=None, endpos=None, overlapped=False,
concurrent=None, **kwargs): partial=False, concurrent=None, **kwargs):
"""Return an iterator over all matches in the string. The matches may be """Return an iterator over all matches in the string. The matches may be
overlapped if overlapped is True. For each match, the iterator returns a overlapped if overlapped is True. For each match, the iterator returns a
match object. Empty matches are included in the result.""" match object. Empty matches are included in the result."""
return _compile(pattern, flags, kwargs).finditer(string, pos, endpos, return _compile(pattern, flags, kwargs).finditer(string, pos, endpos,
overlapped, concurrent) overlapped, concurrent, partial)
def compile(pattern, flags=0, **kwargs): def compile(pattern, flags=0, **kwargs):
"Compile a regular expression pattern, returning a pattern object." "Compile a regular expression pattern, returning a pattern object."
@ -392,6 +392,7 @@ from . import _regex_core
from calibre.constants import plugins from calibre.constants import plugins
_regex = plugins['_regex'][0] _regex = plugins['_regex'][0]
from threading import RLock as _RLock from threading import RLock as _RLock
from locale import getlocale as _getlocale
from ._regex_core import * from ._regex_core import *
from ._regex_core import (_ALL_VERSIONS, _ALL_ENCODINGS, _FirstSetError, from ._regex_core import (_ALL_VERSIONS, _ALL_ENCODINGS, _FirstSetError,
_UnscopedFlagSet, _check_group_features, _compile_firstset, _UnscopedFlagSet, _check_group_features, _compile_firstset,
@ -414,6 +415,7 @@ _cache = {}
_cache_lock = _RLock() _cache_lock = _RLock()
_named_args = {} _named_args = {}
_replacement_cache = {} _replacement_cache = {}
_locale_sensitive = {}
# Maximum size of the cache. # Maximum size of the cache.
_MAXCACHE = 500 _MAXCACHE = 500
@ -421,6 +423,15 @@ _MAXREPCACHE = 500
def _compile(pattern, flags=0, kwargs={}): def _compile(pattern, flags=0, kwargs={}):
"Compiles a regular expression to a PatternObject." "Compiles a regular expression to a PatternObject."
# What locale is this pattern using?
locale_key = (type(pattern), pattern)
if _locale_sensitive.get(locale_key, True) or (flags & LOCALE) != 0:
# This pattern is, or might be, locale-sensitive.
pattern_locale = _getlocale()
else:
# This pattern is definitely not locale-sensitive.
pattern_locale = None
try: try:
# Do we know what keyword arguments are needed? # Do we know what keyword arguments are needed?
args_key = pattern, type(pattern), flags args_key = pattern, type(pattern), flags
@ -433,13 +444,13 @@ def _compile(pattern, flags=0, kwargs={}):
try: try:
args_supplied.add((k, frozenset(kwargs[k]))) args_supplied.add((k, frozenset(kwargs[k])))
except KeyError: except KeyError:
raise error("missing named list") raise error("missing named list: {!r}".format(k))
args_supplied = frozenset(args_supplied) args_supplied = frozenset(args_supplied)
# Have we already seen this regular expression and named list? # Have we already seen this regular expression and named list?
pattern_key = (pattern, type(pattern), flags, args_supplied, pattern_key = (pattern, type(pattern), flags, args_supplied,
DEFAULT_VERSION) DEFAULT_VERSION, pattern_locale)
return _cache[pattern_key] return _cache[pattern_key]
except KeyError: except KeyError:
# It's a new pattern, or new named list for a known pattern. # It's a new pattern, or new named list for a known pattern.
@ -462,18 +473,19 @@ def _compile(pattern, flags=0, kwargs={}):
_regex_core.DEFAULT_VERSION = DEFAULT_VERSION _regex_core.DEFAULT_VERSION = DEFAULT_VERSION
caught_exception = None caught_exception = None
global_flags = flags
while True: while True:
try: try:
source = _Source(pattern) source = _Source(pattern)
info = _Info(flags, source.char_type, kwargs) info = _Info(global_flags, source.char_type, kwargs)
info.guess_encoding = guess_encoding info.guess_encoding = guess_encoding
source.ignore_space = bool(info.flags & VERBOSE) source.ignore_space = bool(info.flags & VERBOSE)
parsed = _parse_pattern(source, info) parsed = _parse_pattern(source, info)
break break
except _UnscopedFlagSet: except _UnscopedFlagSet:
# Remember the global flags for the next attempt. # Remember the global flags for the next attempt.
flags = info.global_flags global_flags = info.global_flags
except error, e: except error, e:
caught_exception = e caught_exception = e
@ -500,6 +512,9 @@ def _compile(pattern, flags=0, kwargs={}):
reverse = bool(info.flags & REVERSE) reverse = bool(info.flags & REVERSE)
fuzzy = isinstance(parsed, _Fuzzy) fuzzy = isinstance(parsed, _Fuzzy)
# Remember whether this pattern as an inline locale flag.
_locale_sensitive[locale_key] = info.inline_locale
# Should we print the parsed pattern? # Should we print the parsed pattern?
if flags & DEBUG: if flags & DEBUG:
parsed.dump(indent=0, reverse=reverse) parsed.dump(indent=0, reverse=reverse)
@ -583,7 +598,8 @@ def _compile(pattern, flags=0, kwargs={}):
args_needed = frozenset(args_needed) args_needed = frozenset(args_needed)
# Store this regular expression and named list. # Store this regular expression and named list.
pattern_key = (pattern, type(pattern), flags, args_needed, DEFAULT_VERSION) pattern_key = (pattern, type(pattern), flags, args_needed, DEFAULT_VERSION,
pattern_locale)
_cache[pattern_key] = compiled_pattern _cache[pattern_key] = compiled_pattern
# Store what keyword arguments are needed. # Store what keyword arguments are needed.
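The _compile() changes above fold the current locale into the cache key for patterns that are, or might be, locale-sensitive, so the same pattern text compiled under different locales no longer shares one cached PatternObject. A hedged illustration of the effect, assuming the bundled module is importable as regex and that _getlocale() tracks locale.setlocale():

import locale
import regex

pat = br"(?L)\w+"  # inline LOCALE flag makes the pattern locale-sensitive

locale.setlocale(locale.LC_CTYPE, "C")
first = regex.compile(pat)

locale.setlocale(locale.LC_CTYPE, "")  # switch to the user's default locale
second = regex.compile(pat)

# Before this commit both calls could hit the same cache entry; with the
# locale folded into pattern_key they are cached (and compiled) separately,
# so False is expected here whenever the two locales differ.
print(first is second)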

File diff suppressed because it is too large

View File

@ -11,7 +11,7 @@
* 2010-01-16 mrab Re-written * 2010-01-16 mrab Re-written
*/ */
/* Supports Unicode version 6.3.0. */ /* Supports Unicode version 7.0.0. */
#define RE_MAGIC 20100116 #define RE_MAGIC 20100116
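The header comment above records the jump from Unicode 6.3.0 to 7.0.0 tables. A small hedged check of that, using U+20BD RUBLE SIGN, which was added in Unicode 7.0 with General_Category Sc, and assuming the bundled module is importable as regex:

# -*- coding: utf-8 -*-
import regex

# U+20BD (RUBLE SIGN) only exists from Unicode 7.0 onwards, so with the
# updated tables it is matched by \p{Sc} (currency symbol); under the old
# 6.3.0 tables it would have fallen through as an unassigned code point.
print(bool(regex.match(u"\\p{Sc}", u"\u20bd")))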

View File

@ -14,7 +14,6 @@
# 2010-01-16 mrab Python front-end re-written and extended # 2010-01-16 mrab Python front-end re-written and extended
import string import string
import sys
import unicodedata import unicodedata
from collections import defaultdict from collections import defaultdict
@ -23,6 +22,7 @@ _regex = plugins['_regex'][0]
if _regex is None: if _regex is None:
raise RuntimeError('Failed to load regex module with error: ' + plugins['_regex'][1]) raise RuntimeError('Failed to load regex module with error: ' + plugins['_regex'][1])
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH", __all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
"F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "R", "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "R",
"REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE", "V0", "VERSION0", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE", "V0", "VERSION0",
@ -114,6 +114,7 @@ HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
# A singleton which indicates a comment within a pattern. # A singleton which indicates a comment within a pattern.
COMMENT = object() COMMENT = object()
FLAGS = object()
# The names of the opcodes. # The names of the opcodes.
OPCODES = """ OPCODES = """
@ -248,8 +249,8 @@ def _shrink_cache(cache_dict, args_dict, max_length, divisor=5):
# Rebuild the arguments dictionary. # Rebuild the arguments dictionary.
args_dict.clear() args_dict.clear()
for pattern, pattern_type, flags, args, default_version in cache_dict: for pattern, pattern_type, flags, args, default_version, locale in cache_dict:
args_dict[pattern, pattern_type, flags, default_version] = args args_dict[pattern, pattern_type, flags, default_version, locale] = args
def _fold_case(info, string): def _fold_case(info, string):
"Folds the case of a string." "Folds the case of a string."
@ -331,22 +332,63 @@ def _parse_pattern(source, info):
def parse_sequence(source, info): def parse_sequence(source, info):
"Parses a sequence, eg. 'abc'." "Parses a sequence, eg. 'abc'."
sequence = [] sequence = []
item = parse_item(source, info) applied = False
while item: while True:
sequence.append(item) # Get literal characters followed by an element.
item = parse_item(source, info) characters, case_flags, element = parse_literal_and_element(source,
info)
if not element:
# No element, just a literal. We've also reached the end of the
# sequence.
append_literal(characters, case_flags, sequence)
break
if element is COMMENT or element is FLAGS:
append_literal(characters, case_flags, sequence)
elif type(element) is tuple:
# It looks like we've found a quantifier.
ch, saved_pos = element
counts = parse_quantifier(source, info, ch)
if counts:
# It _is_ a quantifier.
apply_quantifier(source, info, counts, characters, case_flags,
ch, saved_pos, applied, sequence)
applied = True
else:
# It's not a quantifier. Maybe it's a fuzzy constraint.
constraints = parse_fuzzy(source, ch)
if constraints:
# It _is_ a fuzzy constraint.
apply_constraint(source, info, constraints, characters,
case_flags, saved_pos, applied, sequence)
applied = True
else:
# The element was just a literal.
characters.append(ord(ch))
append_literal(characters, case_flags, sequence)
applied = False
else:
# We have a literal followed by something else.
append_literal(characters, case_flags, sequence)
sequence.append(element)
applied = False
return make_sequence(sequence) return make_sequence(sequence)
def PossessiveRepeat(element, min_count, max_count): def apply_quantifier(source, info, counts, characters, case_flags, ch,
"Builds a possessive repeat." saved_pos, applied, sequence):
return Atomic(GreedyRepeat(element, min_count, max_count)) if characters:
# The quantifier applies to the last character.
append_literal(characters[ : -1], case_flags, sequence)
element = Character(characters[-1], case_flags=case_flags)
else:
# The quantifier applies to the last item in the sequence.
if applied or not sequence:
raise error("nothing to repeat at position %d" % saved_pos)
element = sequence.pop()
def parse_item(source, info):
"Parses an item, which might be repeated. Returns None if there's no item."
element = parse_element(source, info)
counts = parse_quantifier(source, info)
if counts:
min_count, max_count = counts min_count, max_count = counts
saved_pos = source.pos saved_pos = source.pos
ch = source.get() ch = source.get()
@ -361,51 +403,58 @@ def parse_item(source, info):
source.pos = saved_pos source.pos = saved_pos
repeated = GreedyRepeat repeated = GreedyRepeat
if element.is_empty() or min_count == max_count == 1: # Ignore the quantifier if it applies to a zero-width item or the number of
return element # repeats is fixed at 1.
if not element.is_empty() and (min_count != 1 or max_count != 1):
element = repeated(element, min_count, max_count)
return repeated(element, min_count, max_count) sequence.append(element)
# No quantifier, but maybe there's a fuzzy constraint. def apply_constraint(source, info, constraints, characters, case_flags,
constraints = parse_fuzzy(source) saved_pos, applied, sequence):
if not constraints: if characters:
# No fuzzy constraint. # The constraint applies to the last character.
return element append_literal(characters[ : -1], case_flags, sequence)
element = Character(characters[-1], case_flags=case_flags)
sequence.append(Fuzzy(element, constraints))
else:
# The constraint applies to the last item in the sequence.
if applied or not sequence:
raise error("nothing for fuzzy constraint at position %d" % saved_pos)
element = sequence.pop()
# If a group is marked as fuzzy then put all of the fuzzy part in the # If a group is marked as fuzzy then put all of the fuzzy part in the
# group. # group.
if isinstance(element, Group): if isinstance(element, Group):
element.subpattern = Fuzzy(element.subpattern, constraints) element.subpattern = Fuzzy(element.subpattern, constraints)
return element sequence.append(element)
else:
sequence.append(Fuzzy(element, constraints))
return Fuzzy(element, constraints) def append_literal(characters, case_flags, sequence):
if characters:
sequence.append(Literal(characters, case_flags=case_flags))
def PossessiveRepeat(element, min_count, max_count):
"Builds a possessive repeat."
return Atomic(GreedyRepeat(element, min_count, max_count))
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)} _QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
def parse_quantifier(source, info): def parse_quantifier(source, info, ch):
"Parses a quantifier." "Parses a quantifier."
while True:
saved_pos = source.pos
ch = source.get()
q = _QUANTIFIERS.get(ch) q = _QUANTIFIERS.get(ch)
if q: if q:
# It's a quantifier. # It's a quantifier.
return q return q
if ch == "{": if ch == "{":
# Looks like a limited repeated element, eg. 'a{2,3}'. # Looks like a limited repeated element, eg. 'a{2,3}'.
counts = parse_limited_quantifier(source) counts = parse_limited_quantifier(source)
if counts: if counts:
return counts return counts
elif ch == "(" and source.match("?#"):
# A comment.
parse_comment(source)
continue
# Neither a quantifier nor a comment.
break
# Parse it later, perhaps as a literal.
source.pos = saved_pos
return None return None
def is_above_limit(count): def is_above_limit(count):
@ -441,13 +490,13 @@ def parse_limited_quantifier(source):
return min_count, max_count return min_count, max_count
def parse_fuzzy(source): def parse_fuzzy(source, ch):
"Parses a fuzzy setting, if present." "Parses a fuzzy setting, if present."
saved_pos = source.pos if ch != "{":
if not source.match("{"):
source.pos = saved_pos
return None return None
saved_pos = source.pos
constraints = {} constraints = {}
try: try:
parse_fuzzy_item(source, constraints) parse_fuzzy_item(source, constraints)
@ -455,7 +504,6 @@ def parse_fuzzy(source):
parse_fuzzy_item(source, constraints) parse_fuzzy_item(source, constraints)
except ParseError: except ParseError:
source.pos = saved_pos source.pos = saved_pos
return None return None
if not source.match("}"): if not source.match("}"):
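The parser refactor above (parse_quantifier() and parse_fuzzy() now receive the already-read character) still has to distinguish a counted quantifier from a fuzzy constraint after "{". A hedged reminder of the two syntaxes being disambiguated, using the regex module's documented fuzzy notation:

import regex

# "{2,3}" after an element is a counted quantifier...
print(regex.fullmatch(r"a{2,3}", "aaa") is not None)

# ...while "{e<=1}" is a fuzzy constraint: allow at most one error
# (insertion, deletion or substitution) in the preceding group.
print(regex.fullmatch(r"(?:cats){e<=1}", "cuts") is not None)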
@ -597,10 +645,12 @@ def parse_count(source):
"Parses a quantifier's count, which can be empty." "Parses a quantifier's count, which can be empty."
return source.get_while(DIGITS) return source.get_while(DIGITS)
def parse_element(source, info): def parse_literal_and_element(source, info):
"""Parses an element. An element might actually be a flag, eg. '(?i)', in """Parses a literal followed by an element. The element is FLAGS if it's an
which case it returns None. inline flag or None if it has reached the end of a sequence.
""" """
characters = []
case_flags = info.flags & CASE_FLAGS
while True: while True:
saved_pos = source.pos saved_pos = source.pos
ch = source.get() ch = source.get()
@ -608,71 +658,69 @@ def parse_element(source, info):
if ch in ")|": if ch in ")|":
# The end of a sequence. At the end of the pattern ch is "". # The end of a sequence. At the end of the pattern ch is "".
source.pos = saved_pos source.pos = saved_pos
return None return characters, case_flags, None
elif ch == "\\": elif ch == "\\":
# An escape sequence outside a set. # An escape sequence outside a set.
return parse_escape(source, info, False) element = parse_escape(source, info, False)
return characters, case_flags, element
elif ch == "(": elif ch == "(":
# A parenthesised subpattern or a flag. # A parenthesised subpattern or a flag.
element = parse_paren(source, info) element = parse_paren(source, info)
if element and element is not COMMENT: if element and element is not COMMENT:
return element return characters, case_flags, element
elif ch == ".": elif ch == ".":
# Any character. # Any character.
if info.flags & DOTALL: if info.flags & DOTALL:
return AnyAll() element = AnyAll()
elif info.flags & WORD: elif info.flags & WORD:
return AnyU() element = AnyU()
else: else:
return Any() element = Any()
return characters, case_flags, element
elif ch == "[": elif ch == "[":
# A character set. # A character set.
return parse_set(source, info) element = parse_set(source, info)
return characters, case_flags, element
elif ch == "^": elif ch == "^":
# The start of a line or the string. # The start of a line or the string.
if info.flags & MULTILINE: if info.flags & MULTILINE:
if info.flags & WORD: if info.flags & WORD:
return StartOfLineU() element = StartOfLineU()
else: else:
return StartOfLine() element = StartOfLine()
else: else:
return StartOfString() element = StartOfString()
return characters, case_flags, element
elif ch == "$": elif ch == "$":
# The end of a line or the string. # The end of a line or the string.
if info.flags & MULTILINE: if info.flags & MULTILINE:
if info.flags & WORD: if info.flags & WORD:
return EndOfLineU() element = EndOfLineU()
else: else:
return EndOfLine() element = EndOfLine()
else: else:
if info.flags & WORD: if info.flags & WORD:
return EndOfStringLineU() element = EndOfStringLineU()
else: else:
return EndOfStringLine() element = EndOfStringLine()
elif ch == "{":
# Looks like a limited quantifier.
saved_pos_2 = source.pos
source.pos = saved_pos
counts = parse_quantifier(source, info)
if counts:
# A quantifier where we expected an element.
raise error("nothing to repeat at position %d" % saved_pos_2)
# Not a quantifier, so it's a literal. return characters, case_flags, element
source.pos = saved_pos_2 elif ch in "?*+{":
return make_character(info, ord(ch)) # Looks like a quantifier.
elif ch in "?*+": return characters, case_flags, (ch, saved_pos)
# A quantifier where we expected an element.
raise error("nothing to repeat at position %d" % saved_pos)
else: else:
# A literal. # A literal.
return make_character(info, ord(ch)) characters.append(ord(ch))
else: else:
# A literal. # A literal.
return make_character(info, ord(ch)) characters.append(ord(ch))
def parse_paren(source, info): def parse_paren(source, info):
"Parses a parenthesised subpattern or a flag." """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
inline flag.
"""
saved_pos = source.pos saved_pos = source.pos
ch = source.get() ch = source.get()
if ch == "?": if ch == "?":
@ -897,6 +945,10 @@ def parse_flags(source, info):
else: else:
flags_off = 0 flags_off = 0
if flags_on & LOCALE:
# Remember that this pattern as an inline locale flag.
info.inline_locale = True
return flags_on, flags_off return flags_on, flags_off
def parse_subpattern(source, info, flags_on, flags_off): def parse_subpattern(source, info, flags_on, flags_off):
@ -913,30 +965,10 @@ def parse_subpattern(source, info, flags_on, flags_off):
return subpattern return subpattern
def parse_positional_flags(source, info, flags_on, flags_off):
"Parses positional flags."
version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
if version == VERSION0:
# Positional flags are global and can only be turned on.
if flags_off:
raise error("bad inline flags: can't turn flags off at position %d" % source.pos)
new_global_flags = flags_on & ~info.global_flags
if new_global_flags:
info.global_flags |= new_global_flags
# A global has been turned on, so reparse the pattern.
raise _UnscopedFlagSet(info.global_flags)
else:
info.flags = (info.flags | flags_on) & ~flags_off
source.ignore_space = bool(info.flags & VERBOSE)
return None
def parse_flags_subpattern(source, info): def parse_flags_subpattern(source, info):
"""Parses a flags subpattern. It could be inline flags or a subpattern """Parses a flags subpattern. It could be inline flags or a subpattern
possibly with local flags. possibly with local flags. If it's a subpattern, then that's returned;
if it's a inline flags, then FLAGS is returned.
""" """
flags_on, flags_off = parse_flags(source, info) flags_on, flags_off = parse_flags(source, info)
@ -961,10 +993,30 @@ def parse_flags_subpattern(source, info):
return parse_subpattern(source, info, flags_on, flags_off) return parse_subpattern(source, info, flags_on, flags_off)
if source.match(")"): if source.match(")"):
return parse_positional_flags(source, info, flags_on, flags_off) parse_positional_flags(source, info, flags_on, flags_off)
return FLAGS
raise error("unknown extension at position %d" % source.pos) raise error("unknown extension at position %d" % source.pos)
def parse_positional_flags(source, info, flags_on, flags_off):
"Parses positional flags."
version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
if version == VERSION0:
# Positional flags are global and can only be turned on.
if flags_off:
raise error("bad inline flags: can't turn flags off at position %d" % source.pos)
new_global_flags = flags_on & ~info.global_flags
if new_global_flags:
info.global_flags |= new_global_flags
# A global has been turned on, so reparse the pattern.
raise _UnscopedFlagSet(info.global_flags)
else:
info.flags = (info.flags | flags_on) & ~flags_off
source.ignore_space = bool(info.flags & VERBOSE)
def parse_name(source, allow_numeric=False): def parse_name(source, allow_numeric=False):
"Parses a name." "Parses a name."
name = source.get_while(set(")>"), include=False) name = source.get_while(set(")>"), include=False)
@ -1175,12 +1227,12 @@ def parse_property(source, info, positive, in_set):
prop_name, name = parse_property_name(source) prop_name, name = parse_property_name(source)
if source.match("}"): if source.match("}"):
# It's correctly delimited. # It's correctly delimited.
prop = lookup_property(prop_name, name, positive != negate) prop = lookup_property(prop_name, name, positive != negate, source_pos=source.pos)
return make_property(info, prop, in_set) return make_property(info, prop, in_set)
elif ch and ch in "CLMNPSZ": elif ch and ch in "CLMNPSZ":
# An abbreviated property, eg \pL. # An abbreviated property, eg \pL.
prop = lookup_property(None, ch, positive) prop = lookup_property(None, ch, positive)
return make_property(info, prop, in_set) return make_property(info, prop, in_set, source_pos=source.pos)
# Not a property, so treat as a literal "p" or "P". # Not a property, so treat as a literal "p" or "P".
source.pos = saved_pos source.pos = saved_pos
@ -1375,7 +1427,7 @@ def parse_posix_class(source, info):
if not source.match(":]"): if not source.match(":]"):
raise ParseError() raise ParseError()
return lookup_property(prop_name, name, positive=not negate) return lookup_property(prop_name, name, positive=not negate, source_pos=source.pos)
def float_to_rational(flt): def float_to_rational(flt):
"Converts a float to a rational pair." "Converts a float to a rational pair."
@ -1416,21 +1468,25 @@ def standardise_name(name):
except (ValueError, ZeroDivisionError): except (ValueError, ZeroDivisionError):
return "".join(ch for ch in name if ch not in "_- ").upper() return "".join(ch for ch in name if ch not in "_- ").upper()
def lookup_property(property, value, positive): def lookup_property(property, value, positive, source_pos=None):
"Looks up a property." "Looks up a property."
# Normalise the names (which may still be lists). # Normalise the names (which may still be lists).
property = standardise_name(property) if property else None property = standardise_name(property) if property else None
value = standardise_name(value) value = standardise_name(value)
if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive
if property: if property:
# Both the property and the value are provided. # Both the property and the value are provided.
prop = PROPERTIES.get(property) prop = PROPERTIES.get(property)
if not prop: if not prop:
raise error("unknown property at position %d" % source.pos) raise error("unknown property at position %d" % source_pos)
prop_id, value_dict = prop prop_id, value_dict = prop
val_id = value_dict.get(value) val_id = value_dict.get(value)
if val_id is None: if val_id is None:
raise error("unknown property value at position %d" % source.pos) raise error("unknown property value at position %d" % source_pos)
if "YES" in value_dict and val_id == 0: if "YES" in value_dict and val_id == 0:
positive, val_id = not positive, 1 positive, val_id = not positive, 1
@ -1470,7 +1526,7 @@ def lookup_property(property, value, positive):
return Property((prop_id << 16) | val_id, positive) return Property((prop_id << 16) | val_id, positive)
# Unknown property. # Unknown property.
raise error("unknown property at position %d" % source.pos) raise error("unknown property at position %d" % source_pos)
def _compile_replacement(source, pattern, is_unicode): def _compile_replacement(source, pattern, is_unicode):
"Compiles a replacement template escape sequence." "Compiles a replacement template escape sequence."
@ -1660,6 +1716,12 @@ class RegexBase(object):
def has_simple_start(self): def has_simple_start(self):
return False return False
def compile(self, reverse=False, fuzzy=False):
return self._compile(reverse, fuzzy)
def dump(self, indent, reverse):
self._dump(indent, reverse)
def is_empty(self): def is_empty(self):
return False return False
@ -1686,7 +1748,7 @@ class ZeroWidthBase(RegexBase):
def get_firstset(self, reverse): def get_firstset(self, reverse):
return set([None]) return set([None])
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
flags = 0 flags = 0
if self.positive: if self.positive:
flags |= POSITIVE_OP flags |= POSITIVE_OP
@ -1696,7 +1758,7 @@ class ZeroWidthBase(RegexBase):
flags |= REVERSE_OP flags |= REVERSE_OP
return [(self._opcode, flags)] return [(self._opcode, flags)]
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
print "%s%s %s" % (INDENT * indent, self._op_name, print "%s%s %s" % (INDENT * indent, self._op_name,
POS_TEXT[self.positive]) POS_TEXT[self.positive])
@ -1710,13 +1772,13 @@ class Any(RegexBase):
def has_simple_start(self): def has_simple_start(self):
return True return True
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
flags = 0 flags = 0
if fuzzy: if fuzzy:
flags |= FUZZY_OP flags |= FUZZY_OP
return [(self._opcode[reverse], flags)] return [(self._opcode[reverse], flags)]
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
print "%s%s" % (INDENT * indent, self._op_name) print "%s%s" % (INDENT * indent, self._op_name)
def max_width(self): def max_width(self):
@ -1765,11 +1827,11 @@ class Atomic(RegexBase):
def has_simple_start(self): def has_simple_start(self):
return self.subpattern.has_simple_start() return self.subpattern.has_simple_start()
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) + return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
[(OP.END, )]) [(OP.END, )])
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
print "%sATOMIC" % (INDENT * indent) print "%sATOMIC" % (INDENT * indent)
self.subpattern.dump(indent + 1, reverse) self.subpattern.dump(indent + 1, reverse)
@ -1822,6 +1884,20 @@ class Branch(RegexBase):
return make_sequence(sequence) return make_sequence(sequence)
def optimise(self, info):
# Flatten branches within branches.
branches = Branch._flatten_branches(info, self.branches)
# Try to reduce adjacent single-character branches to sets.
branches = Branch._reduce_to_set(info, branches)
if len(branches) > 1:
sequence = [Branch(branches)]
else:
sequence = branches
return make_sequence(sequence)
def pack_characters(self, info): def pack_characters(self, info):
self.branches = [b.pack_characters(info) for b in self.branches] self.branches = [b.pack_characters(info) for b in self.branches]
return self return self
@ -1846,7 +1922,7 @@ class Branch(RegexBase):
return fs or set([None]) return fs or set([None])
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
code = [(OP.BRANCH, )] code = [(OP.BRANCH, )]
for b in self.branches: for b in self.branches:
code.extend(b.compile(reverse, fuzzy)) code.extend(b.compile(reverse, fuzzy))
@ -1856,7 +1932,7 @@ class Branch(RegexBase):
return code return code
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
print "%sBRANCH" % (INDENT * indent) print "%sBRANCH" % (INDENT * indent)
self.branches[0].dump(indent + 1, reverse) self.branches[0].dump(indent + 1, reverse)
for b in self.branches[1 : ]: for b in self.branches[1 : ]:
@ -2181,10 +2257,10 @@ class CallGroup(RegexBase):
def remove_captures(self): def remove_captures(self):
raise error("group reference not allowed at position %d" % self.position) raise error("group reference not allowed at position %d" % self.position)
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
return [(OP.GROUP_CALL, self.call_ref)] return [(OP.GROUP_CALL, self.call_ref)]
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
print "%sGROUP_CALL %s" % (INDENT * indent, self.group) print "%sGROUP_CALL %s" % (INDENT * indent, self.group)
def __eq__(self, other): def __eq__(self, other):
@ -2229,7 +2305,7 @@ class Character(RegexBase):
def has_simple_start(self): def has_simple_start(self):
return True return True
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
flags = 0 flags = 0
if self.positive: if self.positive:
flags |= POSITIVE_OP flags |= POSITIVE_OP
@ -2248,7 +2324,7 @@ class Character(RegexBase):
return code.compile(reverse, fuzzy) return code.compile(reverse, fuzzy)
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
display = repr(unichr(self.value)).lstrip("bu") display = repr(unichr(self.value)).lstrip("bu")
print "%sCHARACTER %s %s%s" % (INDENT * indent, print "%sCHARACTER %s %s%s" % (INDENT * indent,
POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]) POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags])
@ -2319,7 +2395,7 @@ class Conditional(RegexBase):
return (self.yes_item.get_firstset(reverse) | return (self.yes_item.get_firstset(reverse) |
self.no_item.get_firstset(reverse)) self.no_item.get_firstset(reverse))
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
code = [(OP.GROUP_EXISTS, self.group)] code = [(OP.GROUP_EXISTS, self.group)]
code.extend(self.yes_item.compile(reverse, fuzzy)) code.extend(self.yes_item.compile(reverse, fuzzy))
add_code = self.no_item.compile(reverse, fuzzy) add_code = self.no_item.compile(reverse, fuzzy)
@ -2331,7 +2407,7 @@ class Conditional(RegexBase):
return code return code
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
print "%sGROUP_EXISTS %s" % (INDENT * indent, self.group) print "%sGROUP_EXISTS %s" % (INDENT * indent, self.group)
self.yes_item.dump(indent + 1, reverse) self.yes_item.dump(indent + 1, reverse)
if self.no_item: if self.no_item:
@ -2437,7 +2513,7 @@ class Fuzzy(RegexBase):
def contains_group(self): def contains_group(self):
return self.subpattern.contains_group() return self.subpattern.contains_group()
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
# The individual limits. # The individual limits.
arguments = [] arguments = []
for e in "dise": for e in "dise":
@ -2460,7 +2536,7 @@ class Fuzzy(RegexBase):
return ([(OP.FUZZY, flags) + tuple(arguments)] + return ([(OP.FUZZY, flags) + tuple(arguments)] +
self.subpattern.compile(reverse, True) + [(OP.END,)]) self.subpattern.compile(reverse, True) + [(OP.END,)])
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
constraints = self._constraints_to_string() constraints = self._constraints_to_string()
if constraints: if constraints:
constraints = " " + constraints constraints = " " + constraints
@ -2511,7 +2587,7 @@ class Fuzzy(RegexBase):
return ",".join(constraints) return ",".join(constraints)
class Grapheme(RegexBase): class Grapheme(RegexBase):
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
# Match at least 1 character until a grapheme boundary is reached. Note # Match at least 1 character until a grapheme boundary is reached. Note
# that this is the same whether matching forwards or backwards. # that this is the same whether matching forwards or backwards.
character_matcher = LazyRepeat(AnyAll(), 1, None).compile(reverse, character_matcher = LazyRepeat(AnyAll(), 1, None).compile(reverse,
@ -2520,7 +2596,7 @@ class Grapheme(RegexBase):
return character_matcher + boundary_matcher return character_matcher + boundary_matcher
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
print "%sGRAPHEME" % (INDENT * indent) print "%sGRAPHEME" % (INDENT * indent)
def max_width(self): def max_width(self):
@ -2565,7 +2641,7 @@ class GreedyRepeat(RegexBase):
return fs return fs
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
repeat = [self._opcode, self.min_count] repeat = [self._opcode, self.min_count]
if self.max_count is None: if self.max_count is None:
repeat.append(UNLIMITED) repeat.append(UNLIMITED)
@ -2578,7 +2654,7 @@ class GreedyRepeat(RegexBase):
return ([tuple(repeat)] + subpattern + [(OP.END, )]) return ([tuple(repeat)] + subpattern + [(OP.END, )])
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
if self.max_count is None: if self.max_count is None:
limit = "INF" limit = "INF"
else: else:
@ -2655,7 +2731,7 @@ class Group(RegexBase):
def has_simple_start(self): def has_simple_start(self):
return self.subpattern.has_simple_start() return self.subpattern.has_simple_start()
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
code = [] code = []
key = self.group, reverse, fuzzy key = self.group, reverse, fuzzy
@ -2676,7 +2752,7 @@ class Group(RegexBase):
return code return code
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
group = self.group group = self.group
if group < 0: if group < 0:
group = private_groups[group] group = private_groups[group]
@ -2736,11 +2812,11 @@ class LookAround(RegexBase):
def contains_group(self): def contains_group(self):
return self.subpattern.contains_group() return self.subpattern.contains_group()
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
return ([(OP.LOOKAROUND, int(self.positive), int(not self.behind))] + return ([(OP.LOOKAROUND, int(self.positive), int(not self.behind))] +
self.subpattern.compile(self.behind) + [(OP.END, )]) self.subpattern.compile(self.behind) + [(OP.END, )])
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
print "%sLOOK%s %s" % (INDENT * indent, self._dir_text[self.behind], print "%sLOOK%s %s" % (INDENT * indent, self._dir_text[self.behind],
POS_TEXT[self.positive]) POS_TEXT[self.positive])
self.subpattern.dump(indent + 1, self.behind) self.subpattern.dump(indent + 1, self.behind)
@ -2759,7 +2835,7 @@ class PrecompiledCode(RegexBase):
def __init__(self, code): def __init__(self, code):
self.code = code self.code = code
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
return [tuple(self.code)] return [tuple(self.code)]
class Property(RegexBase): class Property(RegexBase):
@ -2792,7 +2868,7 @@ class Property(RegexBase):
def has_simple_start(self): def has_simple_start(self):
return True return True
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
flags = 0 flags = 0
if self.positive: if self.positive:
flags |= POSITIVE_OP flags |= POSITIVE_OP
@ -2802,7 +2878,7 @@ class Property(RegexBase):
flags |= FUZZY_OP flags |= FUZZY_OP
return [(self._opcode[self.case_flags, reverse], flags, self.value)] return [(self._opcode[self.case_flags, reverse], flags, self.value)]
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
prop = PROPERTY_NAMES[self.value >> 16] prop = PROPERTY_NAMES[self.value >> 16]
name, value = prop[0], prop[1][self.value & 0xFFFF] name, value = prop[0], prop[1][self.value & 0xFFFF]
print "%sPROPERTY %s %s:%s%s" % (INDENT * indent, print "%sPROPERTY %s %s:%s%s" % (INDENT * indent,
@ -2867,7 +2943,7 @@ class Range(RegexBase):
return Branch(items) return Branch(items)
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
flags = 0 flags = 0
if self.positive: if self.positive:
flags |= POSITIVE_OP flags |= POSITIVE_OP
@ -2878,7 +2954,7 @@ class Range(RegexBase):
return [(self._opcode[self.case_flags, reverse], flags, self.lower, return [(self._opcode[self.case_flags, reverse], flags, self.lower,
self.upper)] self.upper)]
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
display_lower = repr(unichr(self.lower)).lstrip("bu") display_lower = repr(unichr(self.lower)).lstrip("bu")
display_upper = repr(unichr(self.upper)).lstrip("bu") display_upper = repr(unichr(self.upper)).lstrip("bu")
print "%sRANGE %s %s %s%s" % (INDENT * indent, POS_TEXT[self.positive], print "%sRANGE %s %s %s%s" % (INDENT * indent, POS_TEXT[self.positive],
@ -2923,13 +2999,13 @@ class RefGroup(RegexBase):
def remove_captures(self): def remove_captures(self):
raise error("group reference not allowed at position %d" % self.position) raise error("group reference not allowed at position %d" % self.position)
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
flags = 0 flags = 0
if fuzzy: if fuzzy:
flags |= FUZZY_OP flags |= FUZZY_OP
return [(self._opcode[self.case_flags, reverse], flags, self.group)] return [(self._opcode[self.case_flags, reverse], flags, self.group)]
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
print "%sREF_GROUP %s%s" % (INDENT * indent, self.group, print "%sREF_GROUP %s%s" % (INDENT * indent, self.group,
CASE_TEXT[self.case_flags]) CASE_TEXT[self.case_flags])
@ -2974,18 +3050,18 @@ class Sequence(RegexBase):
if s.case_flags != case_flags: if s.case_flags != case_flags:
# Different case sensitivity, so flush, unless neither the # Different case sensitivity, so flush, unless neither the
# previous nor the new character are cased. # previous nor the new character are cased.
if case_flags or is_cased(info, s.value): if s.case_flags or is_cased(info, s.value):
Sequence._flush_characters(info, characters, Sequence._flush_characters(info, characters,
case_flags, items) case_flags, items)
case_flags = s.case_flags case_flags = s.case_flags
characters.append(s.value) characters.append(s.value)
elif type(s) is String: elif type(s) is String or type(s) is Literal:
if s.case_flags != case_flags: if s.case_flags != case_flags:
# Different case sensitivity, so flush, unless the neither # Different case sensitivity, so flush, unless the neither
# the previous nor the new string are cased. # the previous nor the new string are cased.
if not s.case_flags or any(is_cased(info, c) for c in if s.case_flags or any(is_cased(info, c) for c in
characters): characters):
Sequence._flush_characters(info, characters, Sequence._flush_characters(info, characters,
case_flags, items) case_flags, items)
@ -3031,7 +3107,7 @@ class Sequence(RegexBase):
def has_simple_start(self): def has_simple_start(self):
return self.items and self.items[0].has_simple_start() return self.items and self.items[0].has_simple_start()
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
seq = self.items seq = self.items
if reverse: if reverse:
seq = seq[::-1] seq = seq[::-1]
@ -3042,7 +3118,7 @@ class Sequence(RegexBase):
return code return code
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
for s in self.items: for s in self.items:
s.dump(indent, reverse) s.dump(indent, reverse)
@ -3112,7 +3188,7 @@ class SetBase(RegexBase):
def has_simple_start(self): def has_simple_start(self):
return True return True
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
flags = 0 flags = 0
if self.positive: if self.positive:
flags |= POSITIVE_OP flags |= POSITIVE_OP
@ -3128,7 +3204,7 @@ class SetBase(RegexBase):
return code return code
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
print "%s%s %s%s" % (INDENT * indent, self._op_name, print "%s%s %s%s" % (INDENT * indent, self._op_name,
POS_TEXT[self.positive], CASE_TEXT[self.case_flags]) POS_TEXT[self.positive], CASE_TEXT[self.case_flags])
for i in self.items: for i in self.items:
@ -3306,7 +3382,7 @@ class SetUnion(SetBase):
return self._handle_case_folding(info, in_set) return self._handle_case_folding(info, in_set)
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
flags = 0 flags = 0
if self.positive: if self.positive:
flags |= POSITIVE_OP flags |= POSITIVE_OP
@ -3395,7 +3471,7 @@ class String(RegexBase):
def has_simple_start(self): def has_simple_start(self):
return True return True
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
flags = 0 flags = 0
if fuzzy: if fuzzy:
flags |= FUZZY_OP flags |= FUZZY_OP
@ -3404,7 +3480,7 @@ class String(RegexBase):
return [(self._opcode[self.case_flags, reverse], flags, return [(self._opcode[self.case_flags, reverse], flags,
len(self.folded_characters)) + self.folded_characters] len(self.folded_characters)) + self.folded_characters]
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
display = repr("".join(unichr(c) for c in self.characters)).lstrip("bu") display = repr("".join(unichr(c) for c in self.characters)).lstrip("bu")
print "%sSTRING %s%s" % (INDENT * indent, display, print "%sSTRING %s%s" % (INDENT * indent, display,
CASE_TEXT[self.case_flags]) CASE_TEXT[self.case_flags])
@ -3415,6 +3491,13 @@ class String(RegexBase):
def get_required_string(self, reverse): def get_required_string(self, reverse):
return 0, self return 0, self
class Literal(String):
def _dump(self, indent, reverse):
for c in self.characters:
display = ascii("".join(chr(c))).lstrip("bu")
print("{}CHARACTER MATCH {}{}".format(INDENT * indent,
display, CASE_TEXT[self.case_flags]))
class StringSet(RegexBase): class StringSet(RegexBase):
_opcode = {(NOCASE, False): OP.STRING_SET, (IGNORECASE, False): _opcode = {(NOCASE, False): OP.STRING_SET, (IGNORECASE, False):
OP.STRING_SET_IGN, (FULLCASE, False): OP.STRING_SET, (FULLIGNORECASE, OP.STRING_SET_IGN, (FULLCASE, False): OP.STRING_SET, (FULLIGNORECASE,
@ -3433,7 +3516,7 @@ class StringSet(RegexBase):
if self.set_key not in info.named_lists_used: if self.set_key not in info.named_lists_used:
info.named_lists_used[self.set_key] = len(info.named_lists_used) info.named_lists_used[self.set_key] = len(info.named_lists_used)
def compile(self, reverse=False, fuzzy=False): def _compile(self, reverse, fuzzy):
index = self.info.named_lists_used[self.set_key] index = self.info.named_lists_used[self.set_key]
items = self.info.kwargs[self.name] items = self.info.kwargs[self.name]
@ -3469,7 +3552,7 @@ class StringSet(RegexBase):
return [(self._opcode[case_flags, reverse], index, min_len, return [(self._opcode[case_flags, reverse], index, min_len,
max_len)] max_len)]
def dump(self, indent=0, reverse=False): def _dump(self, indent, reverse):
print "%sSTRING_SET %s%s" % (INDENT * indent, self.name, print "%sSTRING_SET %s%s" % (INDENT * indent, self.name,
CASE_TEXT[self.case_flags]) CASE_TEXT[self.case_flags])
@ -3740,6 +3823,7 @@ class Info(object):
flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION] flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
self.flags = flags self.flags = flags
self.global_flags = flags self.global_flags = flags
self.inline_locale = False
self.kwargs = kwargs self.kwargs = kwargs
@ -3799,8 +3883,8 @@ class Info(object):
def _check_group_features(info, parsed): def _check_group_features(info, parsed):
"""Checks whether the reverse and fuzzy features of the group calls match """Checks whether the reverse and fuzzy features of the group calls match
the groups which they call.""" the groups which they call.
"""
call_refs = {} call_refs = {}
additional_groups = [] additional_groups = []
for call, reverse, fuzzy in info.group_calls: for call, reverse, fuzzy in info.group_calls:
@ -3976,12 +4060,12 @@ CHARACTER_ESCAPES = {
# Predefined character set escape sequences. # Predefined character set escape sequences.
CHARSET_ESCAPES = { CHARSET_ESCAPES = {
"d": lookup_property(None, "DIGIT", True), "d": lookup_property(None, "Digit", True),
"D": lookup_property(None, "DIGIT", False), "D": lookup_property(None, "Digit", False),
"s": lookup_property(None, "SPACE", True), "s": lookup_property(None, "Space", True),
"S": lookup_property(None, "SPACE", False), "S": lookup_property(None, "Space", False),
"w": lookup_property(None, "WORD", True), "w": lookup_property(None, "Word", True),
"W": lookup_property(None, "WORD", False), "W": lookup_property(None, "Word", False),
} }
# Positional escape sequences. # Positional escape sequences.

File diff suppressed because it is too large

View File

@ -41,6 +41,8 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 ch);
#define RE_PROP_P 34 #define RE_PROP_P 34
#define RE_PROP_S 35 #define RE_PROP_S 35
#define RE_PROP_Z 36 #define RE_PROP_Z 36
#define RE_PROP_ASSIGNED 38
#define RE_PROP_CASEDLETTER 37
#define RE_PROP_CN 0 #define RE_PROP_CN 0
#define RE_PROP_LU 1 #define RE_PROP_LU 1
@ -84,19 +86,17 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 ch);
#define RE_PROP_ALNUM 0x460001 #define RE_PROP_ALNUM 0x460001
#define RE_PROP_ALPHA 0x070001 #define RE_PROP_ALPHA 0x070001
#define RE_PROP_ANY 0x470001 #define RE_PROP_ANY 0x470001
#define RE_PROP_ASCII 0x480001 #define RE_PROP_ASCII 0x010001
#define RE_PROP_ASSIGNED 0x490001 #define RE_PROP_BLANK 0x480001
#define RE_PROP_BLANK 0x4A0001
#define RE_PROP_CNTRL 0x00000F #define RE_PROP_CNTRL 0x00000F
#define RE_PROP_DIGIT 0x000009 #define RE_PROP_DIGIT 0x000009
#define RE_PROP_GRAPH 0x4B0001 #define RE_PROP_GRAPH 0x490001
#define RE_PROP_LOWER 0x080001 #define RE_PROP_LOWER 0x080001
#define RE_PROP_PRINT 0x4C0001 #define RE_PROP_PRINT 0x4A0001
#define RE_PROP_PUNCT 0x000022
#define RE_PROP_SPACE 0x190001 #define RE_PROP_SPACE 0x190001
#define RE_PROP_UPPER 0x090001 #define RE_PROP_UPPER 0x090001
#define RE_PROP_WORD 0x4D0001 #define RE_PROP_WORD 0x4B0001
#define RE_PROP_XDIGIT 0x4E0001 #define RE_PROP_XDIGIT 0x4C0001
#define RE_BREAK_OTHER 0 #define RE_BREAK_OTHER 0
#define RE_BREAK_DOUBLEQUOTE 1 #define RE_BREAK_DOUBLEQUOTE 1
@ -130,11 +130,11 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 ch);
#define RE_GBREAK_LVT 11 #define RE_GBREAK_LVT 11
#define RE_GBREAK_PREPEND 12 #define RE_GBREAK_PREPEND 12
extern char* re_strings[1155]; extern char* re_strings[1257];
extern RE_Property re_properties[145]; extern RE_Property re_properties[143];
extern RE_PropertyValue re_property_values[1244]; extern RE_PropertyValue re_property_values[1372];
extern RE_UINT16 re_expand_on_folding[104]; extern RE_UINT16 re_expand_on_folding[104];
extern RE_GetPropertyFunc re_get_property[79]; extern RE_GetPropertyFunc re_get_property[77];
RE_UINT32 re_get_general_category(RE_UINT32 ch); RE_UINT32 re_get_general_category(RE_UINT32 ch);
RE_UINT32 re_get_block(RE_UINT32 ch); RE_UINT32 re_get_block(RE_UINT32 ch);
@ -208,8 +208,6 @@ RE_UINT32 re_get_indic_matra_category(RE_UINT32 ch);
RE_UINT32 re_get_indic_syllabic_category(RE_UINT32 ch); RE_UINT32 re_get_indic_syllabic_category(RE_UINT32 ch);
RE_UINT32 re_get_alphanumeric(RE_UINT32 ch); RE_UINT32 re_get_alphanumeric(RE_UINT32 ch);
RE_UINT32 re_get_any(RE_UINT32 ch); RE_UINT32 re_get_any(RE_UINT32 ch);
RE_UINT32 re_get_ascii(RE_UINT32 ch);
RE_UINT32 re_get_assigned(RE_UINT32 ch);
RE_UINT32 re_get_blank(RE_UINT32 ch); RE_UINT32 re_get_blank(RE_UINT32 ch);
RE_UINT32 re_get_graph(RE_UINT32 ch); RE_UINT32 re_get_graph(RE_UINT32 ch);
RE_UINT32 re_get_print(RE_UINT32 ch); RE_UINT32 re_get_print(RE_UINT32 ch);