Merge branch 'master' of https://github.com/cbhaley/calibre

2025-08-30 23:00:21 -04:00 · 2022-07-14 17:42:54 +05:30 · 2022-07-14 17:42:54 +05:30 · 77bdfee72a
commit 77bdfee72a
parent 115f6ef9b1 58328a5beb
4 changed files with 49 additions and 7 deletions
--- a/manual/gui.rst
+++ b/manual/gui.rst
@ -386,7 +386,7 @@ Two variants of equality searches are used for hierarchical items (e.g., A.B.C):

 *'Regular expression' searches*

-Regular expression searches are indicated by prefixing the search string with a tilde (~). Any `Python-compatible regular expression <https://docs.python.org/library/re.html>`__ can be used. Backslashes used to escape special characters in regular expressions must be doubled because single backslashes will be removed during query parsing. For example, to match a literal parenthesis you must enter ``\\(``. Regular expression searches are 'contains' searches unless the expression is anchored. Character variants are significant: ``~e`` doesn't match ``é``.
+Regular expression searches are indicated by prefixing the search string with a tilde (~). Any `Python-compatible regular expression <https://docs.python.org/library/re.html>`__ can be used. Backslashes used to escape special characters in regular expressions must be doubled because single backslashes will be removed during query parsing. For example, to match a literal parenthesis you must enter ``\\(`` or alternatively use `super quotes` (see below). Regular expression searches are 'contains' searches unless the expression is anchored. Character variants are significant: ``~e`` doesn't match ``é``.

 *'Character variant' searches*

@ -414,6 +414,21 @@ then these character variant searches find:
  * ``title:"^db"`` matches nothing
  * ``title:"^,"`` matches #1 (instead of all books) because the comma is significant

+*Search Expression Syntax*
+
+A `search expression` is a sequence of `search terms` optionally separated by the operators ``and`` and ``or``. If two search terms occur without a separating operator, ``and`` is assumed. The ``and`` operator has priority over the ``or`` operator; for example the expression ``a or b and c`` is the same as ``a or (b and c)``. You can use parentheses to change the priority; for example, use ``(a or b) and c`` to make the ``or`` evaluate before the ``and``. You can use the operator ``not`` to negate (invert) the result of evaluating a search expression. Examples:
+
+  * ``not tag:foo`` finds all books that don't contain the tag ``foo``
+  * ``not (author:Asimov or author:Weber)`` finds all books not written by either Asimov or Weber.
+
+The above examples contain `search terms`. A basic `search term` is a sequence of characters not including spaces, quotes (``"``), backslashes (``\``), or parentheses (``( )``). It can be optionally preceded by a column name specifier: the `lookup name` of a column followed by a colon (``:``), for example ``author:Asimov``. If a search term must contain a space then the entire term must be enclosed in quotes, as in ``title:"The Ring"``. If the search term must contain quotes then they must be `escaped` with backslashes. For example, to search for a series named `The "Ball" and The "Chain"`, use::
+
+  series:"The \"Ball\" and The \"Chain\""
+
+If you need an actual backslash, something that happens frequently in `regular expression` searches, use two of them (``\\``).
+
+It is sometimes hard to get all the escapes right so the result is what you want, especially in `regular expression` and `template` searches. In these cases use the `super-quote`: ``"""sequence of characters"""``. Super-quoted characters are used unchanged: no escape processing is done.
+
 *More information*

 To search for a string that begins with an equals, tilde, or caret; prefix the string with a backslash.
@ -524,6 +539,7 @@ Examples:
  * ``template:"program: connected_device_name('main')#@#:t:kindle"`` -- is true when the ``kindle`` device is connected.
  * ``template:"program: select(formats_sizes(), 'EPUB')#@#:n:>1000000"`` -- finds books with EPUB files larger than 1 MB.
  * ``template:"program: select(formats_modtimes('iso'), 'EPUB')#@#:d:>10daysago"`` -- finds books with EPUB files newer than 10 days ago.
+  * ``template:"""program: book_count('tags:^"' & $series & '"', 0) != 0#@#:n:1"""`` -- finds all books containing the series name in the tags. This example uses super-quoting because the template uses both single quotes (``'``) and double quotes (``"``) when constructing the search expression.

 You can build template search queries easily using the :guilabel:`Advanced search dialog` accessed by clicking the button |sbi|. You can test templates on specific books using the calibre :guilabel:`Template tester`, which can be added to the toolbars or menus via :guilabel:`Preferences->Toolbars & menus`. It can also be assigned a keyboard shortcut via :guilabel:`Preferences->Shortcuts`.

--- a/src/calibre/gui2/dialogs/search.py
+++ b/src/calibre/gui2/dialogs/search.py
@ -324,6 +324,14 @@ class SearchDialog(QDialog):
        QDialog.__init__(self, parent)
        setup_ui(self, db)

+        # Get metadata of some of the selected books to give to the template
+        # dialog to help test the template
+        from calibre.gui2.ui import get_gui
+        view = get_gui().library_view
+        rows = view.selectionModel().selectedRows()[0:10] # Maximum of 10 books
+        mi = [db.new_api.get_proxy_metadata(db.data.index_to_id(x.row())) for x in rows]
+        self.template_program_box.set_mi(mi)
+
        current_tab = gprefs.get('advanced search dialog current tab', 0)
        self.tab_widget.setCurrentIndex(current_tab)
        if current_tab == 1:
@ -393,12 +401,13 @@ class SearchDialog(QDialog):

    def template_search_string(self):
        template = str(self.template_program_box.text())
-        value = str(self.template_value_box.text()).replace('"', '\\"')
+        value = str(self.template_value_box.text())
        if template and value:
            cb = self.template_test_type_box
            op =  str(cb.itemData(cb.currentIndex()))
            l = f'{template}#@#:{op}:{value}'
-            return 'template:"' + l + '"'
+            # Use docstring quoting (super-quoting) to avoid problems with escaping
+            return 'template:"""' + l + '"""'
        return ''

    def date_search_string(self):
--- a/src/calibre/utils/search_query_parser.py
+++ b/src/calibre/utils/search_query_parser.py
@ -22,6 +22,7 @@ import weakref, re
 from calibre.constants import preferred_encoding
 from calibre.utils.icu import sort_key
 from calibre import prints
+from polyglot.binary import as_hex_unicode, from_hex_unicode
 from polyglot.builtins import codepoint_to_chr


@ -150,6 +151,9 @@ class Parser:
    EOF = 4
    REPLACEMENTS = tuple(('\\' + x, codepoint_to_chr(i + 1)) for i, x in enumerate('\\"()'))

+    # the sep must be a printable character sequence that won't actually appear naturally
+    docstring_sep = '□ༀ؆' # Unicode white square, Tibetian Om, Arabic-Indic Cube Root
+
    # Had to translate named constants to numeric values
    lex_scanner = re.Scanner([
            (r'[()]', lambda x,t: (Parser.OPCODE, t)),
@ -187,6 +191,11 @@ class Parser:
        self.current_token += 1

    def tokenize(self, expr):
+        # Hex-encode the contents of docstrings (super-quoted strings) so that no
+        # further processing is applied to them. Change the docstring indicator
+        # to something unique with no characters special to the parser.
+        expr = re.sub('(""")(..*?)(""")',
+                  lambda mo: self.docstring_sep + as_hex_unicode(mo.group(2)) + self.docstring_sep, expr)
+
        # Strip out escaped backslashes, quotes and parens so that the
        # lex scanner doesn't get confused. We put them back later.
        for k, v in self.REPLACEMENTS:
@ -194,14 +203,14 @@ class Parser:
        tokens = self.lex_scanner.scan(expr)[0]

        def unescape(x):
+            # recover the docstrings
+            x = re.sub(f'({self.docstring_sep})(..*?)({self.docstring_sep})',
+                       lambda mo: from_hex_unicode(mo.group(2)), x)
            for k, v in self.REPLACEMENTS:
                x = x.replace(v, k[1:])
            return x

-        return [
-            (tt, unescape(tv) if tt in (self.WORD, self.QUOTED_WORD) else tv)
-            for tt, tv in tokens
-        ]
+        return [(tt, unescape(tv)) for tt, tv in tokens]

    def parse(self, expr, locations):
        self.locations = locations
--- a/src/calibre/utils/search_query_parser_test.py
+++ b/src/calibre/utils/search_query_parser_test.py
@ -386,6 +386,14 @@ class TestSQP(unittest.TestCase):
        t('"a \\" () b"', 'Q', 'a " () b')
        t('"a“b"', 'Q', 'a“b')
        t('"a”b"', 'Q', 'a”b')
+        # docstring tests
+        t(r'"""a\1b"""', 'W', r'a\1b')
+        t(r'("""a\1b""" AND """c""" OR d)',
+          'O', '(', 'W', r'a\1b', 'W', 'AND', 'W', 'c',  'W', 'OR', 'W', 'd', 'O', ')')
+        t(r'template:="""a\1b"""', 'W', r'template:=a\1b')
+        t(r'template:"""=a\1b"""', 'W', r'template:=a\1b')
+        t(r'template:"""program: return ("\"1\"")#@#n:1"""', 'W',
+          r'template:program: return ("\"1\"")#@#n:1')


 def find_tests():