From 58328a5bebc6945ed16675a0a3c2979b2f800363 Mon Sep 17 00:00:00 2001
From: Charles Haley <cbhaley@i.wont.say.com>
Date: Thu, 14 Jul 2022 12:32:46 +0100
Subject: [PATCH] Several interlocking changes: 1) Add a docstring syntax to
 the search query parser. """text""" uses text unchanged without escape
 processing. 2) Add tests for docstrings. 3) Update the search section of the
 manual. 4) Change the advanced search dialog to use docstrings for template
 searches. 5) Pass some books to the template dialog in advanced search to
 help with debugging template searches.

---
 manual/gui.rst                                | 18 +++++++++++++++++-
 src/calibre/gui2/dialogs/search.py            | 13 +++++++++++--
 src/calibre/utils/search_query_parser.py      | 17 +++++++++++++----
 src/calibre/utils/search_query_parser_test.py |  8 ++++++++
 4 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/manual/gui.rst b/manual/gui.rst
index be1edd8fc9..78d768f451 100644
--- a/manual/gui.rst
+++ b/manual/gui.rst
@@ -386,7 +386,7 @@ Two variants of equality searches are used for hierarchical items (e.g., A.B.C):
 
 *'Regular expression' searches*
 
-Regular expression searches are indicated by prefixing the search string with a tilde (~). Any `Python-compatible regular expression <https://docs.python.org/library/re.html>`__ can be used. Backslashes used to escape special characters in regular expressions must be doubled because single backslashes will be removed during query parsing. For example, to match a literal parenthesis you must enter ``\\(``. Regular expression searches are 'contains' searches unless the expression is anchored. Character variants are significant: ``~e`` doesn't match ``é``.
+Regular expression searches are indicated by prefixing the search string with a tilde (~). Any `Python-compatible regular expression <https://docs.python.org/library/re.html>`__ can be used. Backslashes used to escape special characters in regular expressions must be doubled because single backslashes will be removed during query parsing. For example, to match a literal parenthesis you must enter ``\\(`` or alternatively use `super-quotes` (see below). Regular expression searches are 'contains' searches unless the expression is anchored. Character variants are significant: ``~e`` doesn't match ``é``.
 
 *'Character variant' searches*
 
@@ -414,6 +414,21 @@ then these character variant searches find:
   * ``title:"^db"`` matches nothing
   * ``title:"^,"`` matches #1 (instead of all books) because the comma is significant
 
+*Search Expression Syntax*
+
+A `search expression` is a sequence of `search terms` optionally separated by the operators ``and`` and ``or``. If two search terms occur without a separating operator, ``and`` is assumed. The ``and`` operator has priority over the ``or`` operator; for example the expression ``a or b and c`` is the same as ``a or (b and c)``. You can use parentheses to change the priority; for example ``(a or b) and c`` to make the ``or`` evaluate before the ``and``. You can use the operator ``not`` to negate (invert) the result of evaluating a search expression. Examples:
+
+  * ``not tag:foo`` finds all books that don't contain the tag ``foo``
+  * ``not (author:Asimov or author:Weber)`` finds all books not written by either Asimov or Weber.
+
+The examples above contain several `search terms`. A basic `search term` is a sequence of characters not including spaces, quotes (``"``), backslashes (``\``), or parentheses (``( )``). It can be optionally preceded by a column name specifier: the `lookup name` of a column followed by a colon (``:``), for example ``author:Asimov``. If a search term must contain a space then the entire term must be enclosed in quotes, as in ``title:"The Ring"``. If the search term must contain quotes then they must be `escaped` with backslashes. For example, to search for a series named `The "Ball" and The "Chain"`, use::
+
+  series:"The \"Ball\" and The \"Chain\""
+
+If you need an actual backslash, something that happens frequently in `regular expression` searches, use two of them (``\\``).
+
+It is sometimes hard to get all the escapes right so the result is what you want, especially in `regular expression` and `template` searches. In these cases use the `super-quote`: ``"""sequence of characters"""``. Super-quoted characters are used unchanged: no escape processing is done.
+
 *More information*
 
 To search for a string that begins with an equals, tilde, or caret; prefix the string with a backslash.
@@ -524,6 +539,7 @@ Examples:
   * ``template:"program: connected_device_name('main')#@#:t:kindle"`` -- is true when the ``kindle`` device is connected.
   * ``template:"program: select(formats_sizes(), 'EPUB')#@#:n:>1000000"`` -- finds books with EPUB files larger than 1 MB.
   * ``template:"program: select(formats_modtimes('iso'), 'EPUB')#@#:d:>10daysago"`` -- finds books with EPUB files newer than 10 days ago.
+  * ``template:"""program: book_count('tags:^"' & $series & '"', 0) != 0#@#:n:1"""`` -- finds all books containing the series name in the tags. This example uses super-quoting because the template uses both single quotes (``'``) and double quotes (``"``) when constructing the search expression.
 
 You can build template search queries easily using the :guilabel:`Advanced search dialog` accessed by clicking the button |sbi|. You can test templates on specific books using the calibre :guilabel:`Template tester`, which can be added to the toolbars or menus via :guilabel:`Preferences->Toolbars & menus`. It can also be assigned a keyboard shortcut via :guilabel:`Preferences->Shortcuts`.
 
diff --git a/src/calibre/gui2/dialogs/search.py b/src/calibre/gui2/dialogs/search.py
index 35a473cdfb..bc026e7379 100644
--- a/src/calibre/gui2/dialogs/search.py
+++ b/src/calibre/gui2/dialogs/search.py
@@ -324,6 +324,14 @@ class SearchDialog(QDialog):
         QDialog.__init__(self, parent)
         setup_ui(self, db)
 
+        # Get metadata of some of the selected books to give to the template
+        # dialog to help test the template
+        from calibre.gui2.ui import get_gui
+        view = get_gui().library_view
+        rows = view.selectionModel().selectedRows()[0:10] # Maximum of 10 books
+        mi = [db.new_api.get_proxy_metadata(db.data.index_to_id(x.row())) for x in rows]
+        self.template_program_box.set_mi(mi)
+
         current_tab = gprefs.get('advanced search dialog current tab', 0)
         self.tab_widget.setCurrentIndex(current_tab)
         if current_tab == 1:
@@ -393,12 +401,13 @@ class SearchDialog(QDialog):
 
     def template_search_string(self):
         template = str(self.template_program_box.text())
-        value = str(self.template_value_box.text()).replace('"', '\\"')
+        value = str(self.template_value_box.text())
         if template and value:
             cb = self.template_test_type_box
             op =  str(cb.itemData(cb.currentIndex()))
             l = f'{template}#@#:{op}:{value}'
-            return 'template:"' + l + '"'
+            # Use docstring quoting (super-quoting) to avoid problems with escaping
+            return 'template:"""' + l + '"""'
         return ''
 
     def date_search_string(self):
diff --git a/src/calibre/utils/search_query_parser.py b/src/calibre/utils/search_query_parser.py
index c8b6442244..bac513a1e4 100644
--- a/src/calibre/utils/search_query_parser.py
+++ b/src/calibre/utils/search_query_parser.py
@@ -22,6 +22,7 @@ import weakref, re
 from calibre.constants import preferred_encoding
 from calibre.utils.icu import sort_key
 from calibre import prints
+from polyglot.binary import as_hex_unicode, from_hex_unicode
 from polyglot.builtins import codepoint_to_chr
 
 
@@ -150,6 +151,9 @@ class Parser:
     EOF = 4
     REPLACEMENTS = tuple(('\\' + x, codepoint_to_chr(i + 1)) for i, x in enumerate('\\"()'))
 
+    # the sep must be a printable character sequence that won't actually appear naturally
+    docstring_sep = '□ༀ؆' # Unicode white square, Tibetian Om, Arabic-Indic Cube Root
+
     # Had to translate named constants to numeric values
     lex_scanner = re.Scanner([
             (r'[()]', lambda x,t: (Parser.OPCODE, t)),
@@ -187,6 +191,11 @@ class Parser:
         self.current_token += 1
 
     def tokenize(self, expr):
+        # convert docstrings to hex to avoid all processing. Change the docstring
+        # indicator to something unique with no characters special to the parser.
+        expr = re.sub('(""")(..*?)(""")',
+                  lambda mo: self.docstring_sep + as_hex_unicode(mo.group(2)) + self.docstring_sep, expr)
+
         # Strip out escaped backslashes, quotes and parens so that the
         # lex scanner doesn't get confused. We put them back later.
         for k, v in self.REPLACEMENTS:
@@ -194,14 +203,14 @@ class Parser:
         tokens = self.lex_scanner.scan(expr)[0]
 
         def unescape(x):
+            # recover the docstrings
+            x = re.sub(f'({self.docstring_sep})(..*?)({self.docstring_sep})',
+                       lambda mo: from_hex_unicode(mo.group(2)), x)
             for k, v in self.REPLACEMENTS:
                 x = x.replace(v, k[1:])
             return x
 
-        return [
-            (tt, unescape(tv) if tt in (self.WORD, self.QUOTED_WORD) else tv)
-            for tt, tv in tokens
-        ]
+        return [(tt, unescape(tv)) for tt, tv in tokens]
 
     def parse(self, expr, locations):
         self.locations = locations
diff --git a/src/calibre/utils/search_query_parser_test.py b/src/calibre/utils/search_query_parser_test.py
index abcaa4ddf1..2097fddc40 100644
--- a/src/calibre/utils/search_query_parser_test.py
+++ b/src/calibre/utils/search_query_parser_test.py
@@ -386,6 +386,14 @@ class TestSQP(unittest.TestCase):
         t('"a \\" () b"', 'Q', 'a " () b')
         t('"a“b"', 'Q', 'a“b')
         t('"a”b"', 'Q', 'a”b')
+        # docstring tests
+        t(r'"""a\1b"""', 'W', r'a\1b')
+        t(r'("""a\1b""" AND """c""" OR d)',
+          'O', '(', 'W', r'a\1b', 'W', 'AND', 'W', 'c',  'W', 'OR', 'W', 'd', 'O', ')')
+        t(r'template:="""a\1b"""', 'W', r'template:=a\1b')
+        t(r'template:"""=a\1b"""', 'W', r'template:=a\1b')
+        t(r'template:"""program: return ("\"1\"")#@#n:1"""', 'W',
+          r'template:program: return ("\"1\"")#@#n:1')
 
 
 def find_tests():