From f41faa558a509459a6de0e19f19996152cff9733 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 30 Jan 2014 18:59:24 +0530
Subject: [PATCH] Update regex module

---
 src/regex/__init__.py |   2 +-
 src/regex/_regex.c    | 186 ++++++++++++++++++++++++------------------
 2 files changed, 108 insertions(+), 80 deletions(-)

diff --git a/src/regex/__init__.py b/src/regex/__init__.py
index e54b5797a3..45d1920d6a 100644
--- a/src/regex/__init__.py
+++ b/src/regex/__init__.py
@@ -225,7 +225,7 @@ __all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match",
   "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W", "WORD", "error",
   "Regex"]
 
-__version__ = "2.4.37"
+__version__ = "2.4.39"
 
 # --------------------------------------------------------------------
 # Public interface.
diff --git a/src/regex/_regex.c b/src/regex/_regex.c
index 2a5698cba9..23ec638006 100644
--- a/src/regex/_regex.c
+++ b/src/regex/_regex.c
@@ -222,9 +222,6 @@ typedef struct RE_State* RE_StatePtr;
 /* Handlers for ASCII, locale and Unicode. */
 typedef struct RE_EncodingTable {
     BOOL (*has_property)(RE_CODE property, Py_UCS4 ch);
-    Py_UCS4 (*lower)(Py_UCS4 ch);
-    Py_UCS4 (*upper)(Py_UCS4 ch);
-    Py_UCS4 (*title)(Py_UCS4 ch);
     BOOL (*at_boundary)(RE_StatePtr state, Py_ssize_t text_pos);
     BOOL (*at_word_start)(RE_StatePtr state, Py_ssize_t text_pos);
     BOOL (*at_word_end)(RE_StatePtr state, Py_ssize_t text_pos);
@@ -774,22 +771,6 @@ static BOOL ascii_has_property(RE_CODE property, Py_UCS4 ch) {
     return unicode_has_property(property, ch);
 }
 
-/* Converts a character to lowercase. */
-static Py_UCS4 ascii_lower(Py_UCS4 ch) {
-    if (ch > RE_ASCII_MAX || !re_get_uppercase(ch))
-        return ch;
-
-    return ch ^ 0x20;
-}
-
-/* Converts a character to uppercase. */
-static Py_UCS4 ascii_upper(Py_UCS4 ch) {
-    if (ch > RE_ASCII_MAX || !re_get_lowercase(ch))
-        return ch;
-
-    return ch ^ 0x20;
-}
-
 /* Checks whether the current text position is on a word boundary. */
 static BOOL ascii_at_boundary(RE_State* state, Py_ssize_t text_pos) {
     BOOL before;
@@ -932,9 +913,6 @@ static int ascii_all_turkic_i(Py_UCS4 ch, Py_UCS4* cases) {
 /* The handlers for ASCII characters. */
 static RE_EncodingTable ascii_encoding = {
     ascii_has_property,
-    ascii_lower,
-    ascii_upper,
-    ascii_upper, /* For ASCII, titlecase is the same as uppercase. */
     ascii_at_boundary,
     ascii_at_word_start,
     ascii_at_word_end,
@@ -1035,22 +1013,6 @@ static BOOL locale_has_property(RE_CODE property, Py_UCS4 ch) {
     return v == value;
 }
 
-/* Converts a character to lowercase. */
-static Py_UCS4 locale_lower(Py_UCS4 ch) {
-    if (ch > RE_LOCALE_MAX)
-        return ch;
-
-    return tolower(ch);
-}
-
-/* Converts a character to uppercase. */
-static Py_UCS4 locale_upper(Py_UCS4 ch) {
-    if (ch > RE_LOCALE_MAX)
-        return ch;
-
-    return toupper(ch);
-}
-
 /* Checks whether the current text position is on a word boundary. */
 static BOOL locale_at_boundary(RE_State* state, Py_ssize_t text_pos) {
     BOOL before;
@@ -1166,9 +1128,6 @@ static int locale_all_turkic_i(Py_UCS4 ch, Py_UCS4* cases) {
 /* The handlers for locale characters. */
 static RE_EncodingTable locale_encoding = {
     locale_has_property,
-    locale_lower,
-    locale_upper,
-    locale_upper, /* For locale, titlecase is the same as uppercase. */
     locale_at_boundary,
     locale_at_word_start,
     locale_at_word_end,
@@ -1226,21 +1185,6 @@ static BOOL unicode_has_property(RE_CODE property, Py_UCS4 ch) {
     return FALSE;
 }
 
-/* Converts a Unicode character to lowercase. */
-static Py_UCS4 unicode_lower(Py_UCS4 ch) {
-    return Py_UNICODE_TOLOWER((Py_UCS4)ch);
-}
-
-/* Converts a Unicode character to uppercase. */
-static Py_UCS4 unicode_upper(Py_UCS4 ch) {
-    return Py_UNICODE_TOUPPER((Py_UCS4)ch);
-}
-
-/* Converts a Unicode character to titlecase. */
-static Py_UCS4 unicode_title(Py_UCS4 ch) {
-    return Py_UNICODE_TOTITLE((Py_UCS4)ch);
-}
-
 /* Checks whether the current text position is on a word boundary. */
 static BOOL unicode_at_boundary(RE_State* state, Py_ssize_t text_pos) {
     BOOL before;
@@ -1285,7 +1229,7 @@ static BOOL unicode_at_word_end(RE_State* state, Py_ssize_t text_pos) {
  * Only a limited number are treated as vowels.
  */
 Py_LOCAL_INLINE(BOOL) is_unicode_vowel(Py_UCS4 ch) {
-    switch (unicode_lower(ch)) {
+    switch (Py_UNICODE_TOLOWER(ch)) {
     case 'a': case 0xE0: case 0xE1: case 0xE2:
     case 'e': case 0xE8: case 0xE9: case 0xEA:
     case 'i': case 0xEC: case 0xED: case 0xEE:
@@ -1773,9 +1717,6 @@ static int unicode_all_turkic_i(Py_UCS4 ch, Py_UCS4* cases) {
 /* The handlers for Unicode characters. */
 static RE_EncodingTable unicode_encoding = {
     unicode_has_property,
-    unicode_lower,
-    unicode_upper,
-    unicode_title,
     unicode_at_boundary,
     unicode_at_word_start,
     unicode_at_word_end,
@@ -8521,6 +8462,7 @@ Py_LOCAL_INLINE(BOOL) retry_fuzzy_match_string_fld(RE_SafeState* safe_state,
     new_node = bt_data->fuzzy_string.position.node;
     data.new_string_pos = bt_data->fuzzy_string.string_pos;
     data.new_folded_pos = bt_data->fuzzy_string.folded_pos;
+    data.folded_len = bt_data->fuzzy_string.folded_len;
     data.fuzzy_type = bt_data->fuzzy_string.fuzzy_type;
     data.step = bt_data->fuzzy_string.step;
 
@@ -8701,6 +8643,7 @@ Py_LOCAL_INLINE(BOOL) retry_fuzzy_match_string_fld2(RE_SafeState* safe_state,
     new_node = bt_data->fuzzy_string.position.node;
     new_group_pos = bt_data->fuzzy_string.string_pos;
     data.new_folded_pos = bt_data->fuzzy_string.folded_pos;
+    data.folded_len = bt_data->fuzzy_string.folded_len;
     data.new_gfolded_pos = bt_data->fuzzy_string.gfolded_pos;
     data.fuzzy_type = bt_data->fuzzy_string.fuzzy_type;
     data.step = bt_data->fuzzy_string.step;
@@ -10953,10 +10896,10 @@ advance:
                         goto backtrack;
                     }
                 }
-
-                string_pos = -1;
             }
 
+            string_pos = -1;
+
             /* Successful match. */
             node = node->next_1.node;
             break;
@@ -11057,12 +11000,10 @@ advance:
                     }
                 }
 
-                if (folded_pos < folded_len) {
-                    string_pos = -1;
-                    goto backtrack;
-                }
-
                 string_pos = -1;
+
+                if (folded_pos < folded_len)
+                    goto backtrack;
             }
 
             /* Successful match. */
@@ -11165,12 +11106,10 @@ advance:
                     }
                 }
 
-                if (folded_pos > 0) {
-                    string_pos = -1;
-                    goto backtrack;
-                }
-
                 string_pos = -1;
+
+                if (folded_pos > 0)
+                    goto backtrack;
             }
 
             /* Successful match. */
@@ -11220,10 +11159,10 @@ advance:
                         goto backtrack;
                     }
                 }
-
-                string_pos = -1;
             }
 
+            string_pos = -1;
+
             /* Successful match. */
             node = node->next_1.node;
             break;
@@ -11271,10 +11210,10 @@ advance:
                         goto backtrack;
                     }
                 }
-
-                string_pos = -1;
             }
 
+            string_pos = -1;
+
             /* Successful match. */
             node = node->next_1.node;
             break;
@@ -11322,10 +11261,10 @@ advance:
                         goto backtrack;
                     }
                 }
-
-                string_pos = -1;
             }
 
+            string_pos = -1;
+
             /* Successful match. */
             node = node->next_1.node;
             break;
@@ -14494,7 +14433,7 @@ Py_LOCAL_INLINE(PyObject*) join_list_info(JoinInfo* join_info) {
  * Returns its length if it is a literal, otherwise -1.
  */
 Py_LOCAL_INLINE(Py_ssize_t) check_replacement_string(PyObject* str_replacement,
-  char special_char) {
+  unsigned char special_char) {
     RE_StringInfo str_info;
     Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
     Py_ssize_t pos;
@@ -16951,6 +16890,94 @@ Py_LOCAL_INLINE(BOOL) append_string(PyObject* list, char* string) {
     return TRUE;
 }
 
+/* Appends the repr of a (decimal) integer to a list; returns FALSE on failure. */
+Py_LOCAL_INLINE(BOOL) append_integer(PyObject* list, Py_ssize_t value) {
+    PyObject* int_obj;
+    PyObject* repr_obj;
+    int status;
+
+    int_obj = Py_BuildValue("n", value); /* "n" is the Py_ssize_t format unit. */
+    if (!int_obj)
+        return FALSE;
+
+    repr_obj = PyObject_Repr(int_obj);
+    Py_DECREF(int_obj); /* Not used past this point, whether or not repr succeeded. */
+    if (!repr_obj)
+        return FALSE;
+
+    status = PyList_Append(list, repr_obj);
+    Py_DECREF(repr_obj); /* Drop the local reference before checking the result. */
+    if (status < 0)
+        return FALSE;
+
+    return TRUE;
+}
+
+/* MatchObject's '__repr__' method; yields "<regex.Match object; span=(S, E), match=R>". */
+static PyObject* match_repr(PyObject* self_) {
+    MatchObject* self;
+    PyObject* list;
+    PyObject* matched_substring;
+    PyObject* matched_repr;
+    int status;
+    PyObject* separator;
+    PyObject* result;
+
+    self = (MatchObject*)self_;
+
+    list = PyList_New(0); /* Collects the pieces, which are joined at the end. */
+    if (!list)
+        return NULL;
+
+    if (!append_string(list, "<regex.Match object; span=("))
+        goto error;
+
+    if (!append_integer(list, self->match_start))
+        goto error;
+
+    if (! append_string(list, ", "))
+        goto error;
+
+    if (!append_integer(list, self->match_end))
+        goto error;
+
+    if (!append_string(list, "), match="))
+        goto error;
+
+    matched_substring = get_slice(self->substring, self->match_start -
+      self->substring_offset, self->match_end - self->substring_offset); /* Span made relative to the stored substring. */
+    if (!matched_substring)
+        goto error;
+
+    matched_repr = PyObject_Repr(matched_substring);
+    Py_DECREF(matched_substring); /* Only its repr is needed from here on. */
+    if (!matched_repr)
+        goto error;
+
+    status = PyList_Append(list, matched_repr);
+    Py_DECREF(matched_repr); /* Drop the local reference before checking the result. */
+    if (status < 0)
+        goto error;
+
+    if (! append_string(list, ">"))
+        goto error;
+
+    separator = Py_BuildValue("s", ""); /* Empty separator for the join. */
+    if (!separator)
+        goto error;
+
+    result = PyUnicode_Join(separator, list); /* Not checked here; returned to the caller as-is. */
+    Py_DECREF(separator);
+
+    Py_DECREF(list);
+
+    return result;
+
+error:
+    Py_DECREF(list); /* Discard the partially built pieces. */
+    return NULL;
+}
+
 /* PatternObject's '__repr__' method. */
 static PyObject* pattern_repr(PyObject* self_) {
     PatternObject* self;
@@ -19873,6 +19900,7 @@ PyMODINIT_FUNC init_regex(void) {
 
     /* Initialise Match_Type. */
     Match_Type.tp_dealloc = match_dealloc;
+    Match_Type.tp_repr = match_repr;
     Match_Type.tp_as_mapping = &match_as_mapping;
     Match_Type.tp_flags = Py_TPFLAGS_DEFAULT;
     Match_Type.tp_doc = match_doc;