Update regex module

2025-08-30 23:00:21 -04:00 · 2016-12-27 14:11:32 +05:30 · 2016-12-27 14:11:32 +05:30 · 641de01570
commit 641de01570
parent 0cc55a1346
3 changed files with 71 additions and 13 deletions
--- a/src/regex/__init__.py
+++ b/src/regex/__init__.py
@@ -239,7 +239,7 @@ __all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match",
  "U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W",
  "WORD", "error", "Regex"]

-__version__ = "2.4.105"
+__version__ = "2.4.112"

 # --------------------------------------------------------------------
 # Public interface.
--- a/src/regex/_regex.c
+++ b/src/regex/_regex.c
@@ -1547,6 +1547,15 @@ Py_LOCAL_INLINE(BOOL) is_unicode_vowel(Py_UCS4 ch) {
    }
 }

+/* Checks whether a character is a Unicode apostrophe.
+ *
+ * This could be U+0027 (APOSTROPHE) or U+2019 (RIGHT SINGLE QUOTATION MARK /
+ * curly apostrophe).
+ */
+static BOOL is_unicode_apostrophe(Py_UCS4 ch) {
+    return ch == 0x27 || ch == 0x2019;
+}
+
 /* Checks whether a position is on a default word boundary.
 *
 * The rules are defined here:
@@ -1667,7 +1676,7 @@ static BOOL unicode_at_default_boundary(RE_State* state, Py_ssize_t text_pos) {

    /* Break between apostrophe and vowels (French, Italian). */
    /* WB5a */
-    if (pos_m1 >= 0 && char_at(state->text, pos_m1) == '\'' &&
+    if (pos_m1 >= 0 && is_unicode_apostrophe(char_at(state->text, pos_m1)) &&
      is_unicode_vowel(char_at(state->text, text_pos)))
        return TRUE;

@@ -1849,7 +1858,8 @@ Py_LOCAL_INLINE(BOOL) unicode_at_default_word_start_or_end(RE_State* state,
    if (prop_m1 == RE_BREAK_ALETTER && prop == RE_BREAK_ALETTER)
        return FALSE;

-    if (pos_m1 >= 0 && char_m1 == '\'' && is_unicode_vowel(char_0))
+    if (pos_m1 >= 0 && is_unicode_apostrophe(char_m1) &&
+      is_unicode_vowel(char_0))
        return TRUE;

    pos_p1 = text_pos + 1;
@@ -10019,7 +10029,7 @@ Py_LOCAL_INLINE(BOOL) any_error_permitted(RE_State* state) {

    return fuzzy_info->total_cost <= values[RE_FUZZY_VAL_MAX_COST] &&
      fuzzy_info->counts[RE_FUZZY_ERR] < values[RE_FUZZY_VAL_MAX_ERR] &&
-      state->total_errors <= state->max_errors;
+      state->total_errors < state->max_errors;
 }

 /* Checks whether this additional fuzzy error is permitted. */
@@ -10032,7 +10042,7 @@ Py_LOCAL_INLINE(BOOL) this_error_permitted(RE_State* state, int fuzzy_type) {

    return fuzzy_info->total_cost + values[RE_FUZZY_VAL_COST_BASE + fuzzy_type]
      <= values[RE_FUZZY_VAL_MAX_COST] && fuzzy_info->counts[fuzzy_type] <
-      values[RE_FUZZY_VAL_MAX_BASE + fuzzy_type] && state->total_errors + 1 <=
+      values[RE_FUZZY_VAL_MAX_BASE + fuzzy_type] && state->total_errors <
      state->max_errors;
 }

@@ -10064,6 +10074,9 @@ Py_LOCAL_INLINE(int) next_fuzzy_match_item(RE_State* state, RE_FuzzyData* data,
        switch (data->fuzzy_type) {
        case RE_FUZZY_DEL:
            /* Could a character at text_pos have been deleted? */
+            if (step == 0)
+                return RE_ERROR_FAILURE;
+
            if (is_string)
                data->new_string_pos += step;
            else
@@ -10074,7 +10087,10 @@ Py_LOCAL_INLINE(int) next_fuzzy_match_item(RE_State* state, RE_FuzzyData* data,
            if (!data->permit_insertion)
                return RE_ERROR_FAILURE;

-            new_pos = data->new_text_pos + step;
+            if (step == 0)
+                new_pos = data->new_text_pos + data->step;
+            else
+                new_pos = data->new_text_pos + step;
            if (state->slice_start <= new_pos && new_pos <= state->slice_end) {
                data->new_text_pos = new_pos;
                return RE_ERROR_SUCCESS;
@@ -10083,6 +10099,9 @@ Py_LOCAL_INLINE(int) next_fuzzy_match_item(RE_State* state, RE_FuzzyData* data,
            return check_fuzzy_partial(state, new_pos);
        case RE_FUZZY_SUB:
            /* Could the character at text_pos have been substituted? */
+            if (step == 0)
+                return RE_ERROR_FAILURE;
+
            new_pos = data->new_text_pos + step;
            if (state->slice_start <= new_pos && new_pos <= state->slice_end) {
                data->new_text_pos = new_pos;
@@ -14808,7 +14827,10 @@ backtrack:
                 * backtracked inside and already restored the groups. We also
                 * need to restore certain flags.
                 */
-                if (bt_data->lookaround.node->match)
+                RE_Node* node;
+
+                node = bt_data->lookaround.node;
+                if (node->match && (node->status & RE_STATUS_HAS_GROUPS))
                    pop_groups(state);

                state->too_few_errors = bt_data->lookaround.too_few_errors;
@@ -16602,13 +16624,14 @@ Py_LOCAL_INLINE(int) do_best_fuzzy_match(RE_SafeState* safe_state, BOOL search)
                 */
                add_to_best_list(safe_state, &best_list, state->match_pos,
                  state->text_pos);
-        }
+        } else
+            start_pos = state->match_pos + step;

        /* Should we keep searching? */
        if (!search)
            break;

-        start_pos = state->match_pos + step;
+        state->max_errors = fewest_errors - 1;
    }

    if (found_match) {
@@ -17103,7 +17126,7 @@ Py_LOCAL_INLINE(BOOL) get_string(PyObject* string, RE_StringInfo* str_info) {
     * instead.
     */
    if (PyUnicode_Check(string)) {
-        /* Unicode strings doesn't always support the buffer interface. */
+        /* Unicode strings don't always support the buffer interface. */
        str_info->characters = (void*)PyUnicode_AS_DATA(string);
        str_info->length = PyUnicode_GET_SIZE(string);
        str_info->charsize = sizeof(Py_UNICODE);
@@ -17112,6 +17135,39 @@ Py_LOCAL_INLINE(BOOL) get_string(PyObject* string, RE_StringInfo* str_info) {
        return TRUE;
    }

+#if defined(PYPY_VERSION)
+    if (PyString_Check(string)) {
+        /* Bytestrings don't always support the buffer interface. */
+        str_info->characters = (void*)PyString_AS_STRING(string);
+        str_info->length = PyString_GET_SIZE(string);
+        str_info->charsize = 1;
+        str_info->is_unicode = FALSE;
+        str_info->should_release = FALSE;
+        return TRUE;
+    }
+
+#endif
+#if defined(PYPY_VERSION)
+    /* Get pointer to string buffer. */
+    if (PyObject_GetBuffer(string, &str_info->view, PyBUF_SIMPLE) != 0) {
+        printf("PyObject_GetBuffer failed!\n");
+        PyErr_SetString(PyExc_TypeError, "expected string or buffer");
+        return FALSE;
+    }
+
+    if (!str_info->view.buf) {
+        PyBuffer_Release(&str_info->view);
+        PyErr_SetString(PyExc_ValueError, "buffer is NULL");
+        return FALSE;
+    }
+
+    str_info->should_release = TRUE;
+
+    str_info->characters = str_info->view.buf;
+    str_info->length = str_info->view.len;
+    str_info->charsize = 1;
+    str_info->is_unicode = FALSE;
+#else
    /* Get pointer to string buffer. */
 #if PY_VERSION_HEX >= 0x02060000
    buffer = Py_TYPE(string)->tp_as_buffer;
@@ -17183,6 +17239,7 @@ Py_LOCAL_INLINE(BOOL) get_string(PyObject* string, RE_StringInfo* str_info) {

    str_info->length = size;
    str_info->is_unicode = FALSE;
+#endif

    return TRUE;
 }
--- a/src/regex/_regex_core.py
+++ b/src/regex/_regex_core.py
@@ -14,7 +14,6 @@
 # 2010-01-16 mrab Python front-end re-written and extended

 import string
-import sys
 import unicodedata
 from collections import defaultdict

@@ -23,7 +22,6 @@ _regex = plugins['_regex'][0]
 if _regex is None:
    raise RuntimeError('Failed to load regex module with error: ' + plugins['_regex'][1])

-
 __all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
@@ -2842,6 +2840,9 @@ class GreedyRepeat(RegexBase):
    def is_atomic(self):
        return self.min_count == self.max_count and self.subpattern.is_atomic()

+    def can_be_affix(self):
+        return False
+
    def contains_group(self):
        return self.subpattern.contains_group()

@@ -3114,7 +3115,7 @@ class LookAroundConditional(RegexBase):
        print("%sEITHER" % (INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
-            print("%sOR".format(INDENT * indent))
+            print("%sOR" % (INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):