From 9f1e1e5a183a99a166630e3964880e7991c8c7cd Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 29 Aug 2020 10:06:06 +0530 Subject: [PATCH] Use the more powerful regex engine for book list and metadata from filename Fixes #1893371 [regex [[\w--[A-Z]] does not work](https://bugs.launchpad.net/calibre/+bug/1893371) --- Changelog.yaml | 8 ++++++++ manual/regexp.rst | 4 +++- manual/regexp_quick_reference.rst | 27 ++++++++++++--------------- src/calibre/db/search.py | 9 +++++---- src/calibre/ebooks/metadata/meta.py | 8 ++++---- 5 files changed, 32 insertions(+), 24 deletions(-) diff --git a/Changelog.yaml b/Changelog.yaml index d38d1fa1fb..d969356c0f 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -5,6 +5,14 @@ # for important features/bug fixes. # Also, each release can have new and improved recipes. +# changes for 5 +# viewer supports annotations +# viewer works with RTL and vertical text +# python upgraded to python 3 link to list of not ported plugins +# regex engine used for searching book list and metadata from file names made more powerful +# dark mode support in the content server and viewer UIs +# content server viewer can now browse and create bookmarks + - version: 4.23.0 date: 2020-08-21 diff --git a/manual/regexp.rst b/manual/regexp.rst index ec1ff47390..089793dc7b 100644 --- a/manual/regexp.rst +++ b/manual/regexp.rst @@ -22,7 +22,9 @@ There are a few places calibre uses regular expressions. There's the :guilabel:`Search & replace` in conversion options, metadata detection from filenames in the import settings and Search & replace when editing the metadata of books in bulk. The calibre book editor can also use regular expressions in its search and replace -feature. +feature. Finally, you can use regular expressions when searching the calibre +book list and when searching inside the calibre viewer. + What on earth *is* a regular expression? 
------------------------------------------------ diff --git a/manual/regexp_quick_reference.rst b/manual/regexp_quick_reference.rst index ac21289d34..b4e2370d4d 100644 --- a/manual/regexp_quick_reference.rst +++ b/manual/regexp_quick_reference.rst @@ -2,9 +2,7 @@ Quick reference for regexp syntax ================================================= This checklist summarizes the most commonly used/hard to remember parts of the -regexp engine available in the calibre edit and conversion search/replace -features. Note that this engine is more powerful than the basic regexp engine -used throughout the rest of calibre. +regexp engine available in most parts of calibre. .. contents:: Contents :depth: 2 @@ -173,25 +171,25 @@ character. The most useful anchors for text processing are: Groups ------ - ``(expression)`` + ``(expression)`` Capturing group, which stores the selection and can be recalled later in the *search* or *replace* patterns with ``\n``, where ``n`` is the - sequence number of the capturing group (starting at 1 in reading order) + sequence number of the capturing group (starting at 1 in reading order) - ``(?:expression)`` + ``(?:expression)`` Group that does not capture the selection - ``(?>expression)`` + ``(?>expression)`` Atomic Group: As soon as the expression is satisfied, the regexp engine passes, and if the rest of the pattern fails, it will not backtrack to try other combinations with the expression. Atomic groups do not - capture. + capture. - ``(?|expression)`` + ``(?|expression)`` Branch reset group: the branches of the alternations included in the expression share the same group numbers - - ``(?P<name>expression)`` + + ``(?P<name>expression)`` Group named “name”. The selection can be recalled later in the *search* pattern by ``(?P=name)`` and in the *replace* by ``\g<name>``. Two different groups can use the same name. @@ -220,7 +218,7 @@ Lookarounds Lookaheads and lookbehinds do not consume characters, they are zero length and do not capture. 
They are atomic groups: as soon as the assertion is satisfied, the regexp engine passes, and if the rest of the pattern fails, it will not -backtrack inside the lookaround to try other combinations. +backtrack inside the lookaround to try other combinations. When looking for multiple matches in a string, at the starting position of each match attempt, a lookbehind can inspect the characters before the current @@ -230,7 +228,7 @@ only select 2, because the starting position after the first selection is immediately before 3, and there are not enough digits for a second match. Similarly, ``\d(\d)`` only captures 2. In calibre's regexp engine practice, the positive lookbehind behaves in the same way, and selects only 2, contrary to -theory. +theory. Groups can be placed inside lookarounds, but capture is rarely useful. Nevertheless, if it is useful, it will be necessary to be very careful in the @@ -275,7 +273,7 @@ To select a string between double quotation marks without stopping on an embedde “((?>[^“”]+|(?R))*[^“”]+)” This template can also be used to modify pairs of tags that can be -embedded, such as ``
<div>`` tags. +embedded, such as ``<div>
`` tags. Special characters @@ -334,4 +332,3 @@ Modes ``(?m)`` Makes the ``^`` and ``$`` anchors match the start and end of lines instead of the start and end of the entire string. - diff --git a/src/calibre/db/search.py b/src/calibre/db/search.py index 336e46e78a..a11c416217 100644 --- a/src/calibre/db/search.py +++ b/src/calibre/db/search.py @@ -6,7 +6,7 @@ __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import re, weakref, operator +import regex, weakref, operator from functools import partial from datetime import timedelta from collections import deque, OrderedDict @@ -72,7 +72,8 @@ def _match(query, value, matchkind, use_primary_find_in_search=True, case_sensit elif query == t: return True elif matchkind == REGEXP_MATCH: - if re.search(query, t, re.UNICODE if case_sensitive else re.I|re.UNICODE): + flags = regex.UNICODE | regex.VERSION1 | (0 if case_sensitive else regex.IGNORECASE) + if regex.search(query, t, flags) is not None: return True elif matchkind == CONTAINS_MATCH: if not case_sensitive and use_primary_find_in_search: @@ -80,7 +81,7 @@ def _match(query, value, matchkind, use_primary_find_in_search=True, case_sensit return True elif query in t: return True - except re.error: + except regex.error: pass return False # }}} @@ -100,7 +101,7 @@ class DateSearch(object): # {{{ self.local_today = {'_today', 'today', icu_lower(_('today'))} self.local_yesterday = {'_yesterday', 'yesterday', icu_lower(_('yesterday'))} self.local_thismonth = {'_thismonth', 'thismonth', icu_lower(_('thismonth'))} - self.daysago_pat = re.compile(r'(%s|daysago|_daysago)$'%_('daysago')) + self.daysago_pat = regex.compile(r'(%s|daysago|_daysago)$'%_('daysago'), flags=regex.UNICODE | regex.VERSION1) def eq(self, dbdate, query, field_count): if dbdate.year == query.year: diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py index 8a1e1fa512..fd11e9bd52 100644 --- a/src/calibre/ebooks/metadata/meta.py +++ 
b/src/calibre/ebooks/metadata/meta.py @@ -3,7 +3,7 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -import os, re, collections +import os, regex, collections from calibre.utils.config import prefs from calibre.constants import filesystem_encoding @@ -105,8 +105,8 @@ def _get_metadata(stream, stream_type, use_libprs_metadata, name = os.path.basename(getattr(stream, 'name', '')) # The fallback pattern matches the default filename format produced by calibre - base = metadata_from_filename(name, pat=pattern, fallback_pat=re.compile( - r'^(?P<title>.+) - (?P<author>[^-]+)$')) + base = metadata_from_filename(name, pat=pattern, fallback_pat=regex.compile( + r'^(?P<title>.+) - (?P<author>[^-]+)$', flags=regex.UNICODE | regex.VERSION1)) if not base.authors: base.authors = [_('Unknown')] if not base.title: @@ -133,7 +133,7 @@ def metadata_from_filename(name, pat=None, fallback_pat=None): name = name.rpartition('.')[0] mi = MetaInformation(None, None) if pat is None: - pat = re.compile(prefs.get('filename_pattern')) + pat = regex.compile(prefs.get('filename_pattern'), flags=regex.UNICODE | regex.VERSION1) name = name.replace('_', ' ') match = pat.search(name) if match is None and fallback_pat is not None: