Edit Book: Update regex engine to support Unicode 9.0

This commit is contained in:
Kovid Goyal 2016-07-21 09:27:48 +05:30
parent a99da9be69
commit 24c7756df1
6 changed files with 10610 additions and 7584 deletions

View File

@ -76,9 +76,21 @@ The special characters are:
(?<!...) Matches if not preceded by ....
(?(id)yes|no) Matches yes pattern if group id matched, the (optional)
no pattern otherwise.
(?(DEFINE)...) If there's no group called "DEFINE", then ... will be
ignored, but any group definitions will be available.
(?|...|...) (?|A|B) creates an RE that will match either A or B,
but reuses capture group numbers across the
alternatives.
(*FAIL) Forces matching to fail, which means immediate
backtracking.
(*F) Abbreviation for (*FAIL).
(*PRUNE) Discards the current backtracking information. Its
effect doesn't extend outside an atomic group or a
lookaround.
(*SKIP) Similar to (*PRUNE), except that it also sets where in
the text the next attempt at matching the entire
pattern will start. Its effect doesn't extend outside
an atomic group or a lookaround.
The fuzzy matching constraints are: "i" to permit insertions, "d" to permit
deletions, "s" to permit substitutions, "e" to permit any of these. Limits are
@ -124,6 +136,7 @@ second character.
\g<name> Matches the text matched by the group named name.
\G Matches the empty string, but only at the position where
the search started.
\K Keeps only what follows for the entire match.
\L<name> Named list. The list is provided as a keyword argument.
\m Matches the empty string, but only at the start of a word.
\M Matches the empty string, but only at the end of a word.
@ -188,6 +201,8 @@ these flags can also be set within an RE:
when matching a bytestring.
B b BESTMATCH Find the best fuzzy match (default is first).
D DEBUG Print the parsed pattern.
E e ENHANCEMATCH Attempt to improve the fit after finding the first
fuzzy match.
F f FULLCASE Use full case-folding when performing
case-insensitive matching in Unicode.
I i IGNORECASE Perform case-insensitive matching.
@ -196,8 +211,7 @@ these flags can also be set within an RE:
M m MULTILINE "^" matches the beginning of lines (after a newline)
as well as the string. "$" matches the end of lines
(before a newline) as well as the end of the string.
E e ENHANCEMATCH Attempt to improve the fit after finding the first
fuzzy match.
P p POSIX Perform POSIX-standard matching (leftmost longest).
R r REVERSE Searches backwards.
S s DOTALL "." matches any character at all, including the
newline.
@ -221,11 +235,11 @@ __all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match",
"purge", "search", "split", "splititer", "sub", "subf", "subfn", "subn",
"template", "Scanner", "A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E",
"ENHANCEMATCH", "S", "DOTALL", "F", "FULLCASE", "I", "IGNORECASE", "L",
"LOCALE", "M", "MULTILINE", "R", "REVERSE", "T", "TEMPLATE", "U", "UNICODE",
"V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W", "WORD", "error",
"Regex"]
"LOCALE", "M", "MULTILINE", "P", "POSIX", "R", "REVERSE", "T", "TEMPLATE",
"U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W",
"WORD", "error", "Regex"]
__version__ = "2.4.66"
__version__ = "2.4.105"
# --------------------------------------------------------------------
# Public interface.
@ -341,50 +355,27 @@ def template(pattern, flags=0):
def escape(pattern, special_only=False):
"Escape all non-alphanumeric characters or special characters in pattern."
if isinstance(pattern, unicode):
s = []
if special_only:
for c in pattern:
if c in _METACHARS:
s.append(u"\\")
s.append(c)
elif c == u"\x00":
s.append(u"\\000")
else:
s.append(c)
else:
for c in pattern:
if c in _ALNUM:
s.append(c)
elif c == u"\x00":
s.append(u"\\000")
else:
s.append(u"\\")
s.append(c)
return u"".join(s)
s = []
if special_only:
for c in pattern:
if c in _METACHARS:
s.append("\\")
s.append(c)
elif c == "\x00":
s.append("\\000")
else:
s.append(c)
else:
s = []
if special_only:
for c in pattern:
if c in _METACHARS:
s.append("\\")
s.append(c)
elif c == "\x00":
s.append("\\000")
else:
s.append(c)
else:
for c in pattern:
if c in _ALNUM:
s.append(c)
elif c == "\x00":
s.append("\\000")
else:
s.append("\\")
s.append(c)
for c in pattern:
if c in _ALNUM:
s.append(c)
elif c == "\x00":
s.append("\\000")
else:
s.append("\\")
s.append(c)
return "".join(s)
return pattern[ : 0].join(s)
# --------------------------------------------------------------------
# Internals.
@ -478,10 +469,10 @@ def _compile(pattern, flags=0, kwargs={}):
# Set the default version in the core code in case it has been changed.
_regex_core.DEFAULT_VERSION = DEFAULT_VERSION
caught_exception = None
global_flags = flags
while True:
caught_exception = None
try:
source = _Source(pattern)
info = _Info(global_flags, source.char_type, kwargs)
@ -522,15 +513,23 @@ def _compile(pattern, flags=0, kwargs={}):
# Remember whether this pattern has an inline locale flag.
_locale_sensitive[locale_key] = info.inline_locale
# Fix the group references.
caught_exception = None
try:
parsed.fix_groups(pattern, reverse, False)
except error, e:
caught_exception = e
if caught_exception:
raise error(caught_exception.msg, caught_exception.pattern,
caught_exception.pos)
# Should we print the parsed pattern?
if flags & DEBUG:
parsed.dump(indent=0, reverse=reverse)
# Fix the group references.
parsed.fix_groups(pattern, reverse, False)
# Optimise the parsed pattern.
parsed = parsed.optimise(info)
parsed = parsed.optimise(info, reverse)
parsed = parsed.pack_characters(info)
# Get the required string.
@ -680,10 +679,10 @@ Regex = compile
# Register myself for pickling.
import copy_reg as _copy_reg
def _pickle(p):
return _compile, (p.pattern, p.flags)
def _pickle(pattern):
return _regex.compile, pattern._pickled_data
_copy_reg.pickle(_pattern_type, _pickle, _compile)
_copy_reg.pickle(_pattern_type, _pickle)
if not hasattr(str, "format"):
# Strings don't have the .format method (below Python 2.6).

File diff suppressed because it is too large Load Diff

View File

@ -11,7 +11,7 @@
* 2010-01-16 mrab Re-written
*/
/* Supports Unicode version 7.0.0. */
/* Supports Unicode version 9.0.0. */
#define RE_MAGIC 20100116
@ -34,84 +34,91 @@
#define RE_OP_CHARACTER_IGN 13
#define RE_OP_CHARACTER_IGN_REV 14
#define RE_OP_CHARACTER_REV 15
#define RE_OP_DEFAULT_BOUNDARY 16
#define RE_OP_DEFAULT_END_OF_WORD 17
#define RE_OP_DEFAULT_START_OF_WORD 18
#define RE_OP_END 19
#define RE_OP_END_OF_LINE 20
#define RE_OP_END_OF_LINE_U 21
#define RE_OP_END_OF_STRING 22
#define RE_OP_END_OF_STRING_LINE 23
#define RE_OP_END_OF_STRING_LINE_U 24
#define RE_OP_END_OF_WORD 25
#define RE_OP_FUZZY 26
#define RE_OP_GRAPHEME_BOUNDARY 27
#define RE_OP_GREEDY_REPEAT 28
#define RE_OP_GROUP 29
#define RE_OP_GROUP_CALL 30
#define RE_OP_GROUP_EXISTS 31
#define RE_OP_LAZY_REPEAT 32
#define RE_OP_LOOKAROUND 33
#define RE_OP_NEXT 34
#define RE_OP_PROPERTY 35
#define RE_OP_PROPERTY_IGN 36
#define RE_OP_PROPERTY_IGN_REV 37
#define RE_OP_PROPERTY_REV 38
#define RE_OP_RANGE 39
#define RE_OP_RANGE_IGN 40
#define RE_OP_RANGE_IGN_REV 41
#define RE_OP_RANGE_REV 42
#define RE_OP_REF_GROUP 43
#define RE_OP_REF_GROUP_FLD 44
#define RE_OP_REF_GROUP_FLD_REV 45
#define RE_OP_REF_GROUP_IGN 46
#define RE_OP_REF_GROUP_IGN_REV 47
#define RE_OP_REF_GROUP_REV 48
#define RE_OP_SEARCH_ANCHOR 49
#define RE_OP_SET_DIFF 50
#define RE_OP_SET_DIFF_IGN 51
#define RE_OP_SET_DIFF_IGN_REV 52
#define RE_OP_SET_DIFF_REV 53
#define RE_OP_SET_INTER 54
#define RE_OP_SET_INTER_IGN 55
#define RE_OP_SET_INTER_IGN_REV 56
#define RE_OP_SET_INTER_REV 57
#define RE_OP_SET_SYM_DIFF 58
#define RE_OP_SET_SYM_DIFF_IGN 59
#define RE_OP_SET_SYM_DIFF_IGN_REV 60
#define RE_OP_SET_SYM_DIFF_REV 61
#define RE_OP_SET_UNION 62
#define RE_OP_SET_UNION_IGN 63
#define RE_OP_SET_UNION_IGN_REV 64
#define RE_OP_SET_UNION_REV 65
#define RE_OP_START_OF_LINE 66
#define RE_OP_START_OF_LINE_U 67
#define RE_OP_START_OF_STRING 68
#define RE_OP_START_OF_WORD 69
#define RE_OP_STRING 70
#define RE_OP_STRING_FLD 71
#define RE_OP_STRING_FLD_REV 72
#define RE_OP_STRING_IGN 73
#define RE_OP_STRING_IGN_REV 74
#define RE_OP_STRING_REV 75
#define RE_OP_STRING_SET 76
#define RE_OP_STRING_SET_FLD 77
#define RE_OP_STRING_SET_FLD_REV 78
#define RE_OP_STRING_SET_IGN 79
#define RE_OP_STRING_SET_IGN_REV 80
#define RE_OP_STRING_SET_REV 81
#define RE_OP_BODY_END 82
#define RE_OP_BODY_START 83
#define RE_OP_END_FUZZY 84
#define RE_OP_END_GREEDY_REPEAT 85
#define RE_OP_END_GROUP 86
#define RE_OP_END_LAZY_REPEAT 87
#define RE_OP_GREEDY_REPEAT_ONE 88
#define RE_OP_GROUP_RETURN 89
#define RE_OP_LAZY_REPEAT_ONE 90
#define RE_OP_MATCH_BODY 91
#define RE_OP_MATCH_TAIL 92
#define RE_OP_START_GROUP 93
#define RE_OP_CONDITIONAL 16
#define RE_OP_DEFAULT_BOUNDARY 17
#define RE_OP_DEFAULT_END_OF_WORD 18
#define RE_OP_DEFAULT_START_OF_WORD 19
#define RE_OP_END 20
#define RE_OP_END_OF_LINE 21
#define RE_OP_END_OF_LINE_U 22
#define RE_OP_END_OF_STRING 23
#define RE_OP_END_OF_STRING_LINE 24
#define RE_OP_END_OF_STRING_LINE_U 25
#define RE_OP_END_OF_WORD 26
#define RE_OP_FUZZY 27
#define RE_OP_GRAPHEME_BOUNDARY 28
#define RE_OP_GREEDY_REPEAT 29
#define RE_OP_GROUP 30
#define RE_OP_GROUP_CALL 31
#define RE_OP_GROUP_EXISTS 32
#define RE_OP_KEEP 33
#define RE_OP_LAZY_REPEAT 34
#define RE_OP_LOOKAROUND 35
#define RE_OP_NEXT 36
#define RE_OP_PROPERTY 37
#define RE_OP_PROPERTY_IGN 38
#define RE_OP_PROPERTY_IGN_REV 39
#define RE_OP_PROPERTY_REV 40
#define RE_OP_PRUNE 41
#define RE_OP_RANGE 42
#define RE_OP_RANGE_IGN 43
#define RE_OP_RANGE_IGN_REV 44
#define RE_OP_RANGE_REV 45
#define RE_OP_REF_GROUP 46
#define RE_OP_REF_GROUP_FLD 47
#define RE_OP_REF_GROUP_FLD_REV 48
#define RE_OP_REF_GROUP_IGN 49
#define RE_OP_REF_GROUP_IGN_REV 50
#define RE_OP_REF_GROUP_REV 51
#define RE_OP_SEARCH_ANCHOR 52
#define RE_OP_SET_DIFF 53
#define RE_OP_SET_DIFF_IGN 54
#define RE_OP_SET_DIFF_IGN_REV 55
#define RE_OP_SET_DIFF_REV 56
#define RE_OP_SET_INTER 57
#define RE_OP_SET_INTER_IGN 58
#define RE_OP_SET_INTER_IGN_REV 59
#define RE_OP_SET_INTER_REV 60
#define RE_OP_SET_SYM_DIFF 61
#define RE_OP_SET_SYM_DIFF_IGN 62
#define RE_OP_SET_SYM_DIFF_IGN_REV 63
#define RE_OP_SET_SYM_DIFF_REV 64
#define RE_OP_SET_UNION 65
#define RE_OP_SET_UNION_IGN 66
#define RE_OP_SET_UNION_IGN_REV 67
#define RE_OP_SET_UNION_REV 68
#define RE_OP_SKIP 69
#define RE_OP_START_OF_LINE 70
#define RE_OP_START_OF_LINE_U 71
#define RE_OP_START_OF_STRING 72
#define RE_OP_START_OF_WORD 73
#define RE_OP_STRING 74
#define RE_OP_STRING_FLD 75
#define RE_OP_STRING_FLD_REV 76
#define RE_OP_STRING_IGN 77
#define RE_OP_STRING_IGN_REV 78
#define RE_OP_STRING_REV 79
#define RE_OP_STRING_SET 80
#define RE_OP_STRING_SET_FLD 81
#define RE_OP_STRING_SET_FLD_REV 82
#define RE_OP_STRING_SET_IGN 83
#define RE_OP_STRING_SET_IGN_REV 84
#define RE_OP_STRING_SET_REV 85
#define RE_OP_BODY_END 86
#define RE_OP_BODY_START 87
#define RE_OP_END_ATOMIC 88
#define RE_OP_END_CONDITIONAL 89
#define RE_OP_END_FUZZY 90
#define RE_OP_END_GREEDY_REPEAT 91
#define RE_OP_END_GROUP 92
#define RE_OP_END_LAZY_REPEAT 93
#define RE_OP_END_LOOKAROUND 94
#define RE_OP_GREEDY_REPEAT_ONE 95
#define RE_OP_GROUP_RETURN 96
#define RE_OP_LAZY_REPEAT_ONE 97
#define RE_OP_MATCH_BODY 98
#define RE_OP_MATCH_TAIL 99
#define RE_OP_START_GROUP 100
char* re_op_text[] = {
"RE_OP_FAILURE",
@ -130,6 +137,7 @@ char* re_op_text[] = {
"RE_OP_CHARACTER_IGN",
"RE_OP_CHARACTER_IGN_REV",
"RE_OP_CHARACTER_REV",
"RE_OP_CONDITIONAL",
"RE_OP_DEFAULT_BOUNDARY",
"RE_OP_DEFAULT_END_OF_WORD",
"RE_OP_DEFAULT_START_OF_WORD",
@ -146,6 +154,7 @@ char* re_op_text[] = {
"RE_OP_GROUP",
"RE_OP_GROUP_CALL",
"RE_OP_GROUP_EXISTS",
"RE_OP_KEEP",
"RE_OP_LAZY_REPEAT",
"RE_OP_LOOKAROUND",
"RE_OP_NEXT",
@ -153,6 +162,7 @@ char* re_op_text[] = {
"RE_OP_PROPERTY_IGN",
"RE_OP_PROPERTY_IGN_REV",
"RE_OP_PROPERTY_REV",
"RE_OP_PRUNE",
"RE_OP_RANGE",
"RE_OP_RANGE_IGN",
"RE_OP_RANGE_IGN_REV",
@ -180,6 +190,7 @@ char* re_op_text[] = {
"RE_OP_SET_UNION_IGN",
"RE_OP_SET_UNION_IGN_REV",
"RE_OP_SET_UNION_REV",
"RE_OP_SKIP",
"RE_OP_START_OF_LINE",
"RE_OP_START_OF_LINE_U",
"RE_OP_START_OF_STRING",
@ -198,10 +209,13 @@ char* re_op_text[] = {
"RE_OP_STRING_SET_REV",
"RE_OP_BODY_END",
"RE_OP_BODY_START",
"RE_OP_END_ATOMIC",
"RE_OP_END_CONDITIONAL",
"RE_OP_END_FUZZY",
"RE_OP_END_GREEDY_REPEAT",
"RE_OP_END_GROUP",
"RE_OP_END_LAZY_REPEAT",
"RE_OP_END_LOOKAROUND",
"RE_OP_GREEDY_REPEAT_ONE",
"RE_OP_GROUP_RETURN",
"RE_OP_LAZY_REPEAT_ONE",
@ -219,6 +233,7 @@ char* re_op_text[] = {
#define RE_FLAG_IGNORECASE 0x2
#define RE_FLAG_LOCALE 0x4
#define RE_FLAG_MULTILINE 0x8
#define RE_FLAG_POSIX 0x10000
#define RE_FLAG_REVERSE 0x400
#define RE_FLAG_TEMPLATE 0x1
#define RE_FLAG_UNICODE 0x20

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -24,7 +24,7 @@ typedef struct RE_Property {
typedef struct RE_PropertyValue {
RE_UINT16 name;
RE_UINT8 value_set;
RE_UINT8 id;
RE_UINT16 id;
} RE_PropertyValue;
typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 ch);
@ -83,24 +83,24 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 ch);
#define RE_PROP_S_MASK 0x0F000000
#define RE_PROP_Z_MASK 0x00007000
#define RE_PROP_ALNUM 0x460001
#define RE_PROP_ALNUM 0x470001
#define RE_PROP_ALPHA 0x070001
#define RE_PROP_ANY 0x470001
#define RE_PROP_ANY 0x480001
#define RE_PROP_ASCII 0x010001
#define RE_PROP_BLANK 0x480001
#define RE_PROP_BLANK 0x490001
#define RE_PROP_CNTRL 0x00000F
#define RE_PROP_DIGIT 0x000009
#define RE_PROP_GRAPH 0x490001
#define RE_PROP_GRAPH 0x4A0001
#define RE_PROP_LOWER 0x080001
#define RE_PROP_PRINT 0x4A0001
#define RE_PROP_PRINT 0x4B0001
#define RE_PROP_SPACE 0x190001
#define RE_PROP_UPPER 0x090001
#define RE_PROP_WORD 0x4B0001
#define RE_PROP_XDIGIT 0x4C0001
#define RE_PROP_POSIX_ALNUM 0x4E0001
#define RE_PROP_POSIX_DIGIT 0x4D0001
#define RE_PROP_POSIX_PUNCT 0x4F0001
#define RE_PROP_POSIX_XDIGIT 0x500001
#define RE_PROP_WORD 0x4C0001
#define RE_PROP_XDIGIT 0x4D0001
#define RE_PROP_POSIX_ALNUM 0x4F0001
#define RE_PROP_POSIX_DIGIT 0x4E0001
#define RE_PROP_POSIX_PUNCT 0x500001
#define RE_PROP_POSIX_XDIGIT 0x510001
#define RE_BREAK_OTHER 0
#define RE_BREAK_DOUBLEQUOTE 1
@ -119,26 +119,36 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 ch);
#define RE_BREAK_MIDNUMLET 14
#define RE_BREAK_NUMERIC 15
#define RE_BREAK_EXTENDNUMLET 16
#define RE_BREAK_EBASE 17
#define RE_BREAK_EMODIFIER 18
#define RE_BREAK_ZWJ 19
#define RE_BREAK_GLUEAFTERZWJ 20
#define RE_BREAK_EBASEGAZ 21
#define RE_GBREAK_OTHER 0
#define RE_GBREAK_CR 1
#define RE_GBREAK_LF 2
#define RE_GBREAK_CONTROL 3
#define RE_GBREAK_EXTEND 4
#define RE_GBREAK_REGIONALINDICATOR 5
#define RE_GBREAK_SPACINGMARK 6
#define RE_GBREAK_L 7
#define RE_GBREAK_V 8
#define RE_GBREAK_T 9
#define RE_GBREAK_LV 10
#define RE_GBREAK_LVT 11
#define RE_GBREAK_PREPEND 12
#define RE_GBREAK_PREPEND 1
#define RE_GBREAK_CR 2
#define RE_GBREAK_LF 3
#define RE_GBREAK_CONTROL 4
#define RE_GBREAK_EXTEND 5
#define RE_GBREAK_REGIONALINDICATOR 6
#define RE_GBREAK_SPACINGMARK 7
#define RE_GBREAK_L 8
#define RE_GBREAK_V 9
#define RE_GBREAK_T 10
#define RE_GBREAK_LV 11
#define RE_GBREAK_LVT 12
#define RE_GBREAK_EBASE 13
#define RE_GBREAK_EMODIFIER 14
#define RE_GBREAK_ZWJ 15
#define RE_GBREAK_GLUEAFTERZWJ 16
#define RE_GBREAK_EBASEGAZ 17
extern char* re_strings[1261];
extern RE_Property re_properties[147];
extern RE_PropertyValue re_property_values[1372];
extern char* re_strings[1336];
extern RE_Property re_properties[150];
extern RE_PropertyValue re_property_values[1469];
extern RE_UINT16 re_expand_on_folding[104];
extern RE_GetPropertyFunc re_get_property[81];
extern RE_GetPropertyFunc re_get_property[82];
RE_UINT32 re_get_general_category(RE_UINT32 ch);
RE_UINT32 re_get_block(RE_UINT32 ch);
@ -193,10 +203,11 @@ RE_UINT32 re_get_soft_dotted(RE_UINT32 ch);
RE_UINT32 re_get_logical_order_exception(RE_UINT32 ch);
RE_UINT32 re_get_other_id_start(RE_UINT32 ch);
RE_UINT32 re_get_other_id_continue(RE_UINT32 ch);
RE_UINT32 re_get_sterm(RE_UINT32 ch);
RE_UINT32 re_get_sentence_terminal(RE_UINT32 ch);
RE_UINT32 re_get_variation_selector(RE_UINT32 ch);
RE_UINT32 re_get_pattern_white_space(RE_UINT32 ch);
RE_UINT32 re_get_pattern_syntax(RE_UINT32 ch);
RE_UINT32 re_get_prepended_concatenation_mark(RE_UINT32 ch);
RE_UINT32 re_get_hangul_syllable_type(RE_UINT32 ch);
RE_UINT32 re_get_bidi_class(RE_UINT32 ch);
RE_UINT32 re_get_canonical_combining_class(RE_UINT32 ch);
@ -208,7 +219,7 @@ RE_UINT32 re_get_line_break(RE_UINT32 ch);
RE_UINT32 re_get_numeric_type(RE_UINT32 ch);
RE_UINT32 re_get_numeric_value(RE_UINT32 ch);
RE_UINT32 re_get_bidi_mirrored(RE_UINT32 ch);
RE_UINT32 re_get_indic_matra_category(RE_UINT32 ch);
RE_UINT32 re_get_indic_positional_category(RE_UINT32 ch);
RE_UINT32 re_get_indic_syllabic_category(RE_UINT32 ch);
RE_UINT32 re_get_alphanumeric(RE_UINT32 ch);
RE_UINT32 re_get_any(RE_UINT32 ch);