From 641de015702e2cc93d89b75dd2cd0cab0890f07e Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 27 Dec 2016 14:11:32 +0530
Subject: [PATCH] Update regex module

---
 src/regex/__init__.py    |  2 +-
 src/regex/_regex.c       | 75 +++++++++++++++++++++++++++++++++++-----
 src/regex/_regex_core.py |  7 ++--
 3 files changed, 71 insertions(+), 13 deletions(-)

diff --git a/src/regex/__init__.py b/src/regex/__init__.py
index aba61ad388..454bbebcb4 100644
--- a/src/regex/__init__.py
+++ b/src/regex/__init__.py
@@ -239,7 +239,7 @@ __all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match",
   "U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W",
   "WORD", "error", "Regex"]
 
-__version__ = "2.4.105"
+__version__ = "2.4.112"
 
 # --------------------------------------------------------------------
 # Public interface.
diff --git a/src/regex/_regex.c b/src/regex/_regex.c
index 84cec8413e..17c08e2f85 100644
--- a/src/regex/_regex.c
+++ b/src/regex/_regex.c
@@ -1547,6 +1547,15 @@ Py_LOCAL_INLINE(BOOL) is_unicode_vowel(Py_UCS4 ch) {
     }
 }
 
+/* Checks whether a character is a Unicode apostrophe.
+ *
+ * The result is true for U+0027 (APOSTROPHE) and U+2019 (RIGHT SINGLE
+ * QUOTATION MARK, the curly apostrophe), and false for any other code point.
+ */
+static BOOL is_unicode_apostrophe(Py_UCS4 ch) {
+    return ch == 0x27 || ch == 0x2019;
+}
+
 /* Checks whether a position is on a default word boundary.
  *
  * The rules are defined here:
@@ -1667,7 +1676,7 @@ static BOOL unicode_at_default_boundary(RE_State* state, Py_ssize_t text_pos) {
 
     /* Break between apostrophe and vowels (French, Italian). */
     /* WB5a */
-    if (pos_m1 >= 0 && char_at(state->text, pos_m1) == '\'' &&
+    if (pos_m1 >= 0 && is_unicode_apostrophe(char_at(state->text, pos_m1)) &&
       is_unicode_vowel(char_at(state->text, text_pos)))
         return TRUE;
 
@@ -1849,7 +1858,8 @@ Py_LOCAL_INLINE(BOOL) unicode_at_default_word_start_or_end(RE_State* state,
     if (prop_m1 == RE_BREAK_ALETTER && prop == RE_BREAK_ALETTER)
         return FALSE;
 
-    if (pos_m1 >= 0 && char_m1 == '\'' && is_unicode_vowel(char_0))
+    if (pos_m1 >= 0 && is_unicode_apostrophe(char_m1) &&
+      is_unicode_vowel(char_0))
         return TRUE;
 
     pos_p1 = text_pos + 1;
@@ -10019,7 +10029,7 @@ Py_LOCAL_INLINE(BOOL) any_error_permitted(RE_State* state) {
 
     return fuzzy_info->total_cost <= values[RE_FUZZY_VAL_MAX_COST] &&
       fuzzy_info->counts[RE_FUZZY_ERR] < values[RE_FUZZY_VAL_MAX_ERR] &&
-      state->total_errors <= state->max_errors;
+      state->total_errors < state->max_errors;
 }
 
 /* Checks whether this additional fuzzy error is permitted. */
@@ -10032,7 +10042,7 @@ Py_LOCAL_INLINE(BOOL) this_error_permitted(RE_State* state, int fuzzy_type) {
 
     return fuzzy_info->total_cost + values[RE_FUZZY_VAL_COST_BASE + fuzzy_type]
       <= values[RE_FUZZY_VAL_MAX_COST] && fuzzy_info->counts[fuzzy_type] <
-      values[RE_FUZZY_VAL_MAX_BASE + fuzzy_type] && state->total_errors + 1 <=
+      values[RE_FUZZY_VAL_MAX_BASE + fuzzy_type] && state->total_errors <
       state->max_errors;
 }
 
@@ -10064,6 +10074,9 @@ Py_LOCAL_INLINE(int) next_fuzzy_match_item(RE_State* state, RE_FuzzyData* data,
         switch (data->fuzzy_type) {
         case RE_FUZZY_DEL:
             /* Could a character at text_pos have been deleted? */
+            if (step == 0)
+                return RE_ERROR_FAILURE;
+
             if (is_string)
                 data->new_string_pos += step;
             else
@@ -10074,7 +10087,10 @@ Py_LOCAL_INLINE(int) next_fuzzy_match_item(RE_State* state, RE_FuzzyData* data,
             if (!data->permit_insertion)
                 return RE_ERROR_FAILURE;
 
-            new_pos = data->new_text_pos + step;
+            if (step == 0)
+                new_pos = data->new_text_pos + data->step;
+            else
+                new_pos = data->new_text_pos + step;
             if (state->slice_start <= new_pos && new_pos <= state->slice_end) {
                 data->new_text_pos = new_pos;
                 return RE_ERROR_SUCCESS;
@@ -10083,6 +10099,9 @@ Py_LOCAL_INLINE(int) next_fuzzy_match_item(RE_State* state, RE_FuzzyData* data,
             return check_fuzzy_partial(state, new_pos);
         case RE_FUZZY_SUB:
             /* Could the character at text_pos have been substituted? */
+            if (step == 0)
+                return RE_ERROR_FAILURE;
+
             new_pos = data->new_text_pos + step;
             if (state->slice_start <= new_pos && new_pos <= state->slice_end) {
                 data->new_text_pos = new_pos;
@@ -14808,7 +14827,10 @@ backtrack:
                  * backtracked inside and already restored the groups. We also
                  * need to restore certain flags.
                  */
-                if (bt_data->lookaround.node->match)
+                RE_Node* node;
+
+                node = bt_data->lookaround.node;
+                if (node->match && (node->status & RE_STATUS_HAS_GROUPS))
                     pop_groups(state);
 
                 state->too_few_errors = bt_data->lookaround.too_few_errors;
@@ -16602,13 +16624,14 @@ Py_LOCAL_INLINE(int) do_best_fuzzy_match(RE_SafeState* safe_state, BOOL search)
                  */
                 add_to_best_list(safe_state, &best_list, state->match_pos,
                   state->text_pos);
-        }
+        } else
+            start_pos = state->match_pos + step;
 
         /* Should we keep searching? */
         if (!search)
             break;
 
-        start_pos = state->match_pos + step;
+        state->max_errors = fewest_errors - 1;
     }
 
     if (found_match) {
@@ -17103,7 +17126,7 @@ Py_LOCAL_INLINE(BOOL) get_string(PyObject* string, RE_StringInfo* str_info) {
      * instead.
      */
     if (PyUnicode_Check(string)) {
-        /* Unicode strings doesn't always support the buffer interface. */
+        /* Unicode strings don't always support the buffer interface. */
         str_info->characters = (void*)PyUnicode_AS_DATA(string);
         str_info->length = PyUnicode_GET_SIZE(string);
         str_info->charsize = sizeof(Py_UNICODE);
@@ -17112,6 +17135,39 @@ Py_LOCAL_INLINE(BOOL) get_string(PyObject* string, RE_StringInfo* str_info) {
         return TRUE;
     }
 
+#if defined(PYPY_VERSION)
+    if (PyString_Check(string)) {
+        /* Bytestrings don't always support the buffer interface. */
+        str_info->characters = (void*)PyString_AS_STRING(string);
+        str_info->length = PyString_GET_SIZE(string);
+        str_info->charsize = 1;
+        str_info->is_unicode = FALSE;
+        str_info->should_release = FALSE;
+        return TRUE;
+    }
+
+#endif
+#if defined(PYPY_VERSION)
+    /* Get pointer to string buffer. */
+    if (PyObject_GetBuffer(string, &str_info->view, PyBUF_SIMPLE) != 0) {
+        /* Report the failure only through the Python exception set below. */
+        PyErr_SetString(PyExc_TypeError, "expected string or buffer");
+        return FALSE;
+    }
+
+    if (!str_info->view.buf) {
+        PyBuffer_Release(&str_info->view);
+        PyErr_SetString(PyExc_ValueError, "buffer is NULL");
+        return FALSE;
+    }
+
+    str_info->should_release = TRUE;
+
+    str_info->characters = str_info->view.buf;
+    str_info->length = str_info->view.len;
+    str_info->charsize = 1;
+    str_info->is_unicode = FALSE;
+#else
     /* Get pointer to string buffer. */
 #if PY_VERSION_HEX >= 0x02060000
     buffer = Py_TYPE(string)->tp_as_buffer;
@@ -17183,6 +17239,7 @@ Py_LOCAL_INLINE(BOOL) get_string(PyObject* string, RE_StringInfo* str_info) {
 
     str_info->length = size;
     str_info->is_unicode = FALSE;
+#endif
 
     return TRUE;
 }
diff --git a/src/regex/_regex_core.py b/src/regex/_regex_core.py
index fba42b7f57..4b65078eda 100644
--- a/src/regex/_regex_core.py
+++ b/src/regex/_regex_core.py
@@ -14,7 +14,6 @@
 # 2010-01-16 mrab Python front-end re-written and extended
 
 import string
-import sys
 import unicodedata
 from collections import defaultdict
 
@@ -23,7 +22,6 @@ _regex = plugins['_regex'][0]
 if _regex is None:
     raise RuntimeError('Failed to load regex module with error: ' + plugins['_regex'][1])
 
-
 __all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
   "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
   "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
@@ -2842,6 +2840,9 @@ class GreedyRepeat(RegexBase):
     def is_atomic(self):
         return self.min_count == self.max_count and self.subpattern.is_atomic()
 
+    def can_be_affix(self):
+        return False
+
     def contains_group(self):
         return self.subpattern.contains_group()
 
@@ -3114,7 +3115,7 @@ class LookAroundConditional(RegexBase):
         print("%sEITHER" % (INDENT * indent))
         self.yes_item.dump(indent + 1, reverse)
         if not self.no_item.is_empty():
-            print("%sOR".format(INDENT * indent))
+            print("%sOR" % (INDENT * indent))
             self.no_item.dump(indent + 1, reverse)
 
     def is_empty(self):