Update regex engine (fixes a thread safety bug)

This commit is contained in:
Kovid Goyal 2015-06-09 09:40:44 +05:30
parent c04db5b1ff
commit b51b73b530
5 changed files with 814 additions and 335 deletions

View File

@@ -225,7 +225,7 @@ __all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match",
"V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W", "WORD", "error", "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W", "WORD", "error",
"Regex"] "Regex"]
__version__ = "2.4.61" __version__ = "2.4.64"
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# Public interface. # Public interface.

View File

@@ -1194,6 +1194,18 @@ Py_LOCAL_INLINE(BOOL) locale_has_property(RE_LocaleInfo* locale_info, RE_CODE
case RE_PROP_LOWER >> 16: case RE_PROP_LOWER >> 16:
v = locale_islower(locale_info, ch); v = locale_islower(locale_info, ch);
break; break;
case RE_PROP_POSIX_ALNUM >> 16:
v = re_get_posix_alnum(ch) != 0;
break;
case RE_PROP_POSIX_DIGIT >> 16:
v = re_get_posix_digit(ch) != 0;
break;
case RE_PROP_POSIX_PUNCT >> 16:
v = re_get_posix_punct(ch) != 0;
break;
case RE_PROP_POSIX_XDIGIT >> 16:
v = re_get_posix_xdigit(ch) != 0;
break;
case RE_PROP_PRINT >> 16: case RE_PROP_PRINT >> 16:
v = locale_isprint(locale_info, ch); v = locale_isprint(locale_info, ch);
break; break;
@@ -19101,8 +19113,8 @@ Py_LOCAL_INLINE(PyObject*) pattern_subx(PatternObject* self, PyObject*
#if PY_VERSION_HEX >= 0x02060000 #if PY_VERSION_HEX >= 0x02060000
BOOL built_capture = FALSE; BOOL built_capture = FALSE;
#endif #endif
PyObject* args = NULL; PyObject* args;
PyObject* kwargs = NULL; PyObject* kwargs;
Py_ssize_t end_pos; Py_ssize_t end_pos;
/* Get the string. */ /* Get the string. */

View File

@@ -262,7 +262,7 @@ def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5
# Rebuild the arguments and locale-sensitivity dictionaries. # Rebuild the arguments and locale-sensitivity dictionaries.
args_dict.clear() args_dict.clear()
sensitivity_dict = {} sensitivity_dict = {}
for pattern, pattern_type, flags, args, default_version, locale in cache_dict: for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
args_dict[pattern, pattern_type, flags, default_version, locale] = args args_dict[pattern, pattern_type, flags, default_version, locale] = args
try: try:
sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern] sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
@@ -292,6 +292,9 @@ def _compile_firstset(info, fs):
# If we ignore the case, for simplicity we won't build a firstset. # If we ignore the case, for simplicity we won't build a firstset.
members = set() members = set()
for i in fs: for i in fs:
if isinstance(i, Character) and not i.positive:
return []
if i.case_flags: if i.case_flags:
if isinstance(i, Character): if isinstance(i, Character):
if is_cased(info, i.value): if is_cased(info, i.value):
@@ -1476,7 +1479,7 @@ def parse_posix_class(source, info):
if not source.match(":]"): if not source.match(":]"):
raise ParseError() raise ParseError()
return lookup_property(prop_name, name, not negate, source) return lookup_property(prop_name, name, not negate, source, posix=True)
def float_to_rational(flt): def float_to_rational(flt):
"Converts a float to a rational pair." "Converts a float to a rational pair."
@@ -1517,7 +1520,9 @@ def standardise_name(name):
except (ValueError, ZeroDivisionError): except (ValueError, ZeroDivisionError):
return "".join(ch for ch in name if ch not in "_- ").upper() return "".join(ch for ch in name if ch not in "_- ").upper()
def lookup_property(property, value, positive, source=None): _posix_classes = set('ALNUM DIGIT PUNCT XDIGIT'.split())
def lookup_property(property, value, positive, source=None, posix=False):
"Looks up a property." "Looks up a property."
# Normalise the names (which may still be lists). # Normalise the names (which may still be lists).
property = standardise_name(property) if property else None property = standardise_name(property) if property else None
@@ -1526,6 +1531,9 @@ def lookup_property(property, value, positive, source=None):
if (property, value) == ("GENERALCATEGORY", "ASSIGNED"): if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive
if posix and not property and value.upper() in _posix_classes:
value = 'POSIX' + value
if property: if property:
# Both the property and the value are provided. # Both the property and the value are provided.
prop = PROPERTIES.get(property) prop = PROPERTIES.get(property)
@@ -2650,11 +2658,10 @@ class Grapheme(RegexBase):
def _compile(self, reverse, fuzzy): def _compile(self, reverse, fuzzy):
# Match at least 1 character until a grapheme boundary is reached. Note # Match at least 1 character until a grapheme boundary is reached. Note
# that this is the same whether matching forwards or backwards. # that this is the same whether matching forwards or backwards.
character_matcher = LazyRepeat(AnyAll(), 1, None).compile(reverse, grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
fuzzy) GraphemeBoundary()]))
boundary_matcher = [(OP.GRAPHEME_BOUNDARY, 1)]
return character_matcher + boundary_matcher return grapheme_matcher.compile(reverse, fuzzy)
def _dump(self, indent, reverse): def _dump(self, indent, reverse):
print "%sGRAPHEME" % (INDENT * indent) print "%sGRAPHEME" % (INDENT * indent)
@@ -2662,6 +2669,10 @@ class Grapheme(RegexBase):
def max_width(self): def max_width(self):
return UNLIMITED return UNLIMITED
class GraphemeBoundary:
def compile(self, reverse, fuzzy):
return [(OP.GRAPHEME_BOUNDARY, 1)]
class GreedyRepeat(RegexBase): class GreedyRepeat(RegexBase):
_opcode = OP.GREEDY_REPEAT _opcode = OP.GREEDY_REPEAT
_op_name = "GREEDY_REPEAT" _op_name = "GREEDY_REPEAT"

File diff suppressed because it is too large. [Load Diff]

View File

@@ -97,6 +97,10 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 ch);
#define RE_PROP_UPPER 0x090001 #define RE_PROP_UPPER 0x090001
#define RE_PROP_WORD 0x4B0001 #define RE_PROP_WORD 0x4B0001
#define RE_PROP_XDIGIT 0x4C0001 #define RE_PROP_XDIGIT 0x4C0001
#define RE_PROP_POSIX_ALNUM 0x4E0001
#define RE_PROP_POSIX_DIGIT 0x4D0001
#define RE_PROP_POSIX_PUNCT 0x4F0001
#define RE_PROP_POSIX_XDIGIT 0x500001
#define RE_BREAK_OTHER 0 #define RE_BREAK_OTHER 0
#define RE_BREAK_DOUBLEQUOTE 1 #define RE_BREAK_DOUBLEQUOTE 1
@@ -130,11 +134,11 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 ch);
#define RE_GBREAK_LVT 11 #define RE_GBREAK_LVT 11
#define RE_GBREAK_PREPEND 12 #define RE_GBREAK_PREPEND 12
extern char* re_strings[1257]; extern char* re_strings[1261];
extern RE_Property re_properties[143]; extern RE_Property re_properties[147];
extern RE_PropertyValue re_property_values[1372]; extern RE_PropertyValue re_property_values[1372];
extern RE_UINT16 re_expand_on_folding[104]; extern RE_UINT16 re_expand_on_folding[104];
extern RE_GetPropertyFunc re_get_property[77]; extern RE_GetPropertyFunc re_get_property[81];
RE_UINT32 re_get_general_category(RE_UINT32 ch); RE_UINT32 re_get_general_category(RE_UINT32 ch);
RE_UINT32 re_get_block(RE_UINT32 ch); RE_UINT32 re_get_block(RE_UINT32 ch);
@@ -213,6 +217,10 @@ RE_UINT32 re_get_graph(RE_UINT32 ch);
RE_UINT32 re_get_print(RE_UINT32 ch); RE_UINT32 re_get_print(RE_UINT32 ch);
RE_UINT32 re_get_word(RE_UINT32 ch); RE_UINT32 re_get_word(RE_UINT32 ch);
RE_UINT32 re_get_xdigit(RE_UINT32 ch); RE_UINT32 re_get_xdigit(RE_UINT32 ch);
RE_UINT32 re_get_posix_digit(RE_UINT32 ch);
RE_UINT32 re_get_posix_alnum(RE_UINT32 ch);
RE_UINT32 re_get_posix_punct(RE_UINT32 ch);
RE_UINT32 re_get_posix_xdigit(RE_UINT32 ch);
int re_get_all_cases(RE_UINT32 ch, RE_UINT32* codepoints); int re_get_all_cases(RE_UINT32 ch, RE_UINT32* codepoints);
RE_UINT32 re_get_simple_case_folding(RE_UINT32 ch); RE_UINT32 re_get_simple_case_folding(RE_UINT32 ch);
int re_get_full_case_folding(RE_UINT32 ch, RE_UINT32* codepoints); int re_get_full_case_folding(RE_UINT32 ch, RE_UINT32* codepoints);