Use the more powerful regex engine for book list and metadata from filename

Fixes #1893371 [regex `[[\w--[A-Z]]` does not work](https://bugs.launchpad.net/calibre/+bug/1893371)
This commit is contained in:
Kovid Goyal 2020-08-29 10:06:06 +05:30
parent 1b5bfd9078
commit 9f1e1e5a18
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 32 additions and 24 deletions

View File

@ -5,6 +5,14 @@
# for important features/bug fixes. # for important features/bug fixes.
# Also, each release can have new and improved recipes. # Also, each release can have new and improved recipes.
# changes for 5
# viewer supports annotations
# viewer works with RTL and vertical text
# Python upgraded to Python 3; link to the list of plugins that have not been ported
# regex engine used for searching book list and metadata from file names made more powerful
# dark mode support in the content server and viewer UIs
# content server viewer can now browse and create bookmarks
- version: 4.23.0 - version: 4.23.0
date: 2020-08-21 date: 2020-08-21

View File

@ -22,7 +22,9 @@ There are a few places calibre uses regular expressions. There's the
:guilabel:`Search & replace` in conversion options, metadata detection from filenames in the import :guilabel:`Search & replace` in conversion options, metadata detection from filenames in the import
settings and Search & replace when editing the metadata of books in bulk. The settings and Search & replace when editing the metadata of books in bulk. The
calibre book editor can also use regular expressions in its search and replace calibre book editor can also use regular expressions in its search and replace
feature. feature. Finally, you can use regular expressions when searching the calibre
book list and when searching inside the calibre viewer.
What on earth *is* a regular expression? What on earth *is* a regular expression?
------------------------------------------------ ------------------------------------------------

View File

@ -2,9 +2,7 @@ Quick reference for regexp syntax
================================================= =================================================
This checklist summarizes the most commonly used/hard to remember parts of the This checklist summarizes the most commonly used/hard to remember parts of the
regexp engine available in the calibre edit and conversion search/replace regexp engine available in most parts of calibre.
features. Note that this engine is more powerful than the basic regexp engine
used throughout the rest of calibre.
.. contents:: Contents .. contents:: Contents
:depth: 2 :depth: 2
@ -334,4 +332,3 @@ Modes
``(?m)`` ``(?m)``
Makes the ``^`` and ``$`` anchors match the start and end of lines Makes the ``^`` and ``$`` anchors match the start and end of lines
instead of the start and end of the entire string. instead of the start and end of the entire string.

View File

@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re, weakref, operator import regex, weakref, operator
from functools import partial from functools import partial
from datetime import timedelta from datetime import timedelta
from collections import deque, OrderedDict from collections import deque, OrderedDict
@ -72,7 +72,8 @@ def _match(query, value, matchkind, use_primary_find_in_search=True, case_sensit
elif query == t: elif query == t:
return True return True
elif matchkind == REGEXP_MATCH: elif matchkind == REGEXP_MATCH:
if re.search(query, t, re.UNICODE if case_sensitive else re.I|re.UNICODE): flags = regex.UNICODE | regex.VERSION1 | (0 if case_sensitive else regex.IGNORECASE)
if regex.search(query, t, flags) is not None:
return True return True
elif matchkind == CONTAINS_MATCH: elif matchkind == CONTAINS_MATCH:
if not case_sensitive and use_primary_find_in_search: if not case_sensitive and use_primary_find_in_search:
@ -80,7 +81,7 @@ def _match(query, value, matchkind, use_primary_find_in_search=True, case_sensit
return True return True
elif query in t: elif query in t:
return True return True
except re.error: except regex.error:
pass pass
return False return False
# }}} # }}}
@ -100,7 +101,7 @@ class DateSearch(object): # {{{
self.local_today = {'_today', 'today', icu_lower(_('today'))} self.local_today = {'_today', 'today', icu_lower(_('today'))}
self.local_yesterday = {'_yesterday', 'yesterday', icu_lower(_('yesterday'))} self.local_yesterday = {'_yesterday', 'yesterday', icu_lower(_('yesterday'))}
self.local_thismonth = {'_thismonth', 'thismonth', icu_lower(_('thismonth'))} self.local_thismonth = {'_thismonth', 'thismonth', icu_lower(_('thismonth'))}
self.daysago_pat = re.compile(r'(%s|daysago|_daysago)$'%_('daysago')) self.daysago_pat = regex.compile(r'(%s|daysago|_daysago)$'%_('daysago'), flags=regex.UNICODE | regex.VERSION1)
def eq(self, dbdate, query, field_count): def eq(self, dbdate, query, field_count):
if dbdate.year == query.year: if dbdate.year == query.year:

View File

@ -3,7 +3,7 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, re, collections import os, regex, collections
from calibre.utils.config import prefs from calibre.utils.config import prefs
from calibre.constants import filesystem_encoding from calibre.constants import filesystem_encoding
@ -105,8 +105,8 @@ def _get_metadata(stream, stream_type, use_libprs_metadata,
name = os.path.basename(getattr(stream, 'name', '')) name = os.path.basename(getattr(stream, 'name', ''))
# The fallback pattern matches the default filename format produced by calibre # The fallback pattern matches the default filename format produced by calibre
base = metadata_from_filename(name, pat=pattern, fallback_pat=re.compile( base = metadata_from_filename(name, pat=pattern, fallback_pat=regex.compile(
r'^(?P<title>.+) - (?P<author>[^-]+)$')) r'^(?P<title>.+) - (?P<author>[^-]+)$', flags=regex.UNICODE | regex.VERSION1))
if not base.authors: if not base.authors:
base.authors = [_('Unknown')] base.authors = [_('Unknown')]
if not base.title: if not base.title:
@ -133,7 +133,7 @@ def metadata_from_filename(name, pat=None, fallback_pat=None):
name = name.rpartition('.')[0] name = name.rpartition('.')[0]
mi = MetaInformation(None, None) mi = MetaInformation(None, None)
if pat is None: if pat is None:
pat = re.compile(prefs.get('filename_pattern')) pat = regex.compile(prefs.get('filename_pattern'), flags=regex.UNICODE | regex.VERSION1)
name = name.replace('_', ' ') name = name.replace('_', ' ')
match = pat.search(name) match = pat.search(name)
if match is None and fallback_pat is not None: if match is None and fallback_pat is not None: