Use the more powerful regex engine for book list and metadata from filename

Fixes #1893371 [regex [[\w--[A-Z]] does not work](https://bugs.launchpad.net/calibre/+bug/1893371)
2025-12-09 14:45:01 -05:00 · 2020-08-29 10:06:06 +05:30 · 2020-08-29 10:06:06 +05:30 · 9f1e1e5a18
commit 9f1e1e5a18
parent 1b5bfd9078
5 changed files with 32 additions and 24 deletions
--- a/Changelog.yaml
+++ b/Changelog.yaml
@ -5,6 +5,14 @@
 # for important features/bug fixes.
 # Also, each release can have new and improved recipes.
+# changes for 5
+# viewer supports annotations
+# viewer works with RTL and vertical text
+# python upgraded to python 3 link to list of not ported plugins
+# regex engine used for searching book list and metadata from file names made more powerful
+# dark mode support in the content server and viewer UIs
+# content server viewer can now browse and create bookmarks
 - version: 4.23.0
  date: 2020-08-21
--- a/manual/regexp.rst
+++ b/manual/regexp.rst
@ -22,7 +22,9 @@ There are a few places calibre uses regular expressions. There's the
 :guilabel:`Search & replace` in conversion options, metadata detection from filenames in the import
 settings and Search & replace when editing the metadata of books in bulk. The
 calibre book editor can also use regular expressions in its search and replace
-feature.
+feature. Finally, you can use regular expressions when searching the calibre
+book list and when searching inside the calibre viewer.
 What on earth *is* a regular expression?
 ------------------------------------------------
--- a/manual/regexp_quick_reference.rst
+++ b/manual/regexp_quick_reference.rst
@ -2,9 +2,7 @@ Quick reference for regexp syntax
 =================================================
 This checklist summarizes the most commonly used/hard to remember parts of the
-regexp engine available in the calibre edit and conversion search/replace
+regexp engine available in most parts of calibre.
-features. Note that this engine is more powerful than the basic regexp engine
-used throughout the rest of calibre.
 .. contents:: Contents
  :depth: 2
@ -334,4 +332,3 @@ Modes
    ``(?m)``
        Makes the ``^`` and ``$`` anchors match the start and end of lines
        instead of the start and end of the entire string.
--- a/src/calibre/db/search.py
+++ b/src/calibre/db/search.py
@ -6,7 +6,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
-import re, weakref, operator
+import regex, weakref, operator
 from functools import partial
 from datetime import timedelta
 from collections import deque, OrderedDict
@ -72,7 +72,8 @@ def _match(query, value, matchkind, use_primary_find_in_search=True, case_sensit
                elif query == t:
                    return True
            elif matchkind == REGEXP_MATCH:
-                if re.search(query, t, re.UNICODE if case_sensitive else re.I|re.UNICODE):
+                flags = regex.UNICODE | regex.VERSION1 | (0 if case_sensitive else regex.IGNORECASE)
                if regex.search(query, t, flags) is not None:
                    return True
            elif matchkind == CONTAINS_MATCH:
                if not case_sensitive and use_primary_find_in_search:
@ -80,7 +81,7 @@ def _match(query, value, matchkind, use_primary_find_in_search=True, case_sensit
                        return True
                elif query in t:
                    return True
-        except re.error:
+        except regex.error:
            pass
    return False
 # }}}
@ -100,7 +101,7 @@ class DateSearch(object):  # {{{
        self.local_today         = {'_today', 'today', icu_lower(_('today'))}
        self.local_yesterday     = {'_yesterday', 'yesterday', icu_lower(_('yesterday'))}
        self.local_thismonth     = {'_thismonth', 'thismonth', icu_lower(_('thismonth'))}
-        self.daysago_pat = re.compile(r'(%s|daysago|_daysago)$'%_('daysago'))
+        self.daysago_pat = regex.compile(r'(%s|daysago|_daysago)$'%_('daysago'), flags=regex.UNICODE | regex.VERSION1)
    def eq(self, dbdate, query, field_count):
        if dbdate.year == query.year:
--- a/src/calibre/ebooks/metadata/meta.py
+++ b/src/calibre/ebooks/metadata/meta.py
@ -3,7 +3,7 @@
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-import os, re, collections
+import os, regex, collections
 from calibre.utils.config import prefs
 from calibre.constants import filesystem_encoding
@ -105,8 +105,8 @@ def _get_metadata(stream, stream_type, use_libprs_metadata,
    name = os.path.basename(getattr(stream, 'name', ''))
    # The fallback pattern matches the default filename format produced by calibre
-    base = metadata_from_filename(name, pat=pattern, fallback_pat=re.compile(
+    base = metadata_from_filename(name, pat=pattern, fallback_pat=regex.compile(
-            r'^(?P<title>.+) - (?P<author>[^-]+)$'))
+            r'^(?P<title>.+) - (?P<author>[^-]+)$', flags=regex.UNICODE | regex.VERSION1))
    if not base.authors:
        base.authors = [_('Unknown')]
    if not base.title:
@ -133,7 +133,7 @@ def metadata_from_filename(name, pat=None, fallback_pat=None):
    name = name.rpartition('.')[0]
    mi = MetaInformation(None, None)
    if pat is None:
-        pat = re.compile(prefs.get('filename_pattern'))
+        pat = regex.compile(prefs.get('filename_pattern'), flags=regex.UNICODE | regex.VERSION1)
    name = name.replace('_', ' ')
    match = pat.search(name)
    if match is None and fallback_pat is not None: