Use the more powerful regex engine for book list and metadata from filename

Fixes #1893371 [regex [[\w--[A-Z]] does not work](https://bugs.launchpad.net/calibre/+bug/1893371)
This commit is contained in:
Kovid Goyal 2020-08-29 10:06:06 +05:30
parent 1b5bfd9078
commit 9f1e1e5a18
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 32 additions and 24 deletions

View File

@ -5,6 +5,14 @@
# for important features/bug fixes.
# Also, each release can have new and improved recipes.
# changes for 5
# viewer supports annotations
# viewer works with RTL and vertical text
# Python upgraded to Python 3; link to the list of plugins that have not been ported
# the regex engine used for searching the book list and for reading metadata from file names is now more powerful
# dark mode support in the content server and viewer UIs
# content server viewer can now browse and create bookmarks
- version: 4.23.0
date: 2020-08-21

View File

@ -22,7 +22,9 @@ There are a few places calibre uses regular expressions. There's the
:guilabel:`Search & replace` in conversion options, metadata detection from filenames in the import
settings and Search & replace when editing the metadata of books in bulk. The
calibre book editor can also use regular expressions in its search and replace
feature.
feature. Finally, you can use regular expressions when searching the calibre
book list and when searching inside the calibre viewer.
What on earth *is* a regular expression?
------------------------------------------------

View File

@ -2,9 +2,7 @@ Quick reference for regexp syntax
=================================================
This checklist summarizes the most commonly used/hard to remember parts of the
regexp engine available in the calibre edit and conversion search/replace
features. Note that this engine is more powerful than the basic regexp engine
used throughout the rest of calibre.
regexp engine available in most parts of calibre.
.. contents:: Contents
:depth: 2
@ -334,4 +332,3 @@ Modes
``(?m)``
Makes the ``^`` and ``$`` anchors match the start and end of lines
instead of the start and end of the entire string.

View File

@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, weakref, operator
import regex, weakref, operator
from functools import partial
from datetime import timedelta
from collections import deque, OrderedDict
@ -72,7 +72,8 @@ def _match(query, value, matchkind, use_primary_find_in_search=True, case_sensit
elif query == t:
return True
elif matchkind == REGEXP_MATCH:
if re.search(query, t, re.UNICODE if case_sensitive else re.I|re.UNICODE):
flags = regex.UNICODE | regex.VERSION1 | (0 if case_sensitive else regex.IGNORECASE)
if regex.search(query, t, flags) is not None:
return True
elif matchkind == CONTAINS_MATCH:
if not case_sensitive and use_primary_find_in_search:
@ -80,7 +81,7 @@ def _match(query, value, matchkind, use_primary_find_in_search=True, case_sensit
return True
elif query in t:
return True
except re.error:
except regex.error:
pass
return False
# }}}
@ -100,7 +101,7 @@ class DateSearch(object): # {{{
self.local_today = {'_today', 'today', icu_lower(_('today'))}
self.local_yesterday = {'_yesterday', 'yesterday', icu_lower(_('yesterday'))}
self.local_thismonth = {'_thismonth', 'thismonth', icu_lower(_('thismonth'))}
self.daysago_pat = re.compile(r'(%s|daysago|_daysago)$'%_('daysago'))
self.daysago_pat = regex.compile(r'(%s|daysago|_daysago)$'%_('daysago'), flags=regex.UNICODE | regex.VERSION1)
def eq(self, dbdate, query, field_count):
if dbdate.year == query.year:

View File

@ -3,7 +3,7 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, re, collections
import os, regex, collections
from calibre.utils.config import prefs
from calibre.constants import filesystem_encoding
@ -105,8 +105,8 @@ def _get_metadata(stream, stream_type, use_libprs_metadata,
name = os.path.basename(getattr(stream, 'name', ''))
# The fallback pattern matches the default filename format produced by calibre
base = metadata_from_filename(name, pat=pattern, fallback_pat=re.compile(
r'^(?P<title>.+) - (?P<author>[^-]+)$'))
base = metadata_from_filename(name, pat=pattern, fallback_pat=regex.compile(
r'^(?P<title>.+) - (?P<author>[^-]+)$', flags=regex.UNICODE | regex.VERSION1))
if not base.authors:
base.authors = [_('Unknown')]
if not base.title:
@ -133,7 +133,7 @@ def metadata_from_filename(name, pat=None, fallback_pat=None):
name = name.rpartition('.')[0]
mi = MetaInformation(None, None)
if pat is None:
pat = re.compile(prefs.get('filename_pattern'))
pat = regex.compile(prefs.get('filename_pattern'), flags=regex.UNICODE | regex.VERSION1)
name = name.replace('_', ' ')
match = pat.search(name)
if match is None and fallback_pat is not None: