API to export notes to HTML and import notes from HTML

2025-07-09 03:04:10 -04:00 · 2023-09-16 15:52:30 +05:30 · 2023-09-16 15:52:30 +05:30 · 721d13a809
commit 721d13a809
parent 1fb3b9ea28
5 changed files with 116 additions and 2 deletions
--- a/src/calibre/db/backend.py
+++ b/src/calibre/db/backend.py
@ -1029,6 +1029,13 @@ class DB:
    def restore_notes(self, report_progress):
        self.notes.restore(self.conn, self.tables, report_progress)

+    def import_note(self, field, item_id, html_file_path):
+        '''
+        Import the HTML file at html_file_path as the note for the item
+        item_id in field.
+
+        The item's value is fetched from the field table's id_map with a plain
+        subscript lookup (no default is supplied) and handed to the notes
+        store together with this backend's connection. Returns whatever the
+        notes store's import_note() returns.
+        '''
+        item_value = self.tables[field].id_map[item_id]
+        notes = self.notes
+        return notes.import_note(self.conn, field, item_id, item_value, html_file_path)
+
+    def export_note(self, field, item_id, dest_dir):
+        '''
+        Export the note for the item item_id in field into dest_dir.
+
+        Thin wrapper that supplies this backend's connection to the notes
+        store and returns its result unchanged.
+        '''
+        exporter = self.notes.export_note
+        return exporter(self.conn, field, item_id, dest_dir)
+
    def initialize_fts(self, dbref):
        self.fts = None
        if not self.prefs['fts_enabled']:
--- a/src/calibre/db/cache.py
+++ b/src/calibre/db/cache.py
@ -705,6 +705,14 @@ class Cache:
    def unretire_note_for(self, field, item_id) -> int:
        return self.backend.unretire_note_for(field, item_id)

+    @read_api
+    def export_note(self, field, item_id, dest_dir):
+        '''
+        Export the note for the item item_id in field into the directory
+        dest_dir. The call is forwarded to the backend and its result is
+        returned unchanged.
+        '''
+        backend = self.backend
+        return backend.export_note(field, item_id, dest_dir)
+
+    @write_api
+    def import_note(self, field, item_id, path_to_html_file):
+        '''
+        Import the HTML file at path_to_html_file as the note for the item
+        item_id in field. The call is forwarded to the backend and its result
+        is returned unchanged.
+        '''
+        backend = self.backend
+        return backend.import_note(field, item_id, path_to_html_file)
+
    @write_api  # we need to use write locking as SQLITE gives a locked table error if multiple FTS queries are made at the same time
    def notes_search(
        self,
--- a/src/calibre/db/notes/connect.py
+++ b/src/calibre/db/notes/connect.py
@ -466,3 +466,27 @@ class Notes:
                    errors.append(_('Could not restore item: {} as not present in database').format(f'{field}/{item_id}'))
                    report_progress('', i)
        return errors
+
+    def export_note(self, conn, field_name, item_id, dest_dir):
+        '''
+        Export the note for item_id in field_name into dest_dir.
+
+        Returns an empty string when get_note_data() reports no note.
+        Otherwise every resource hash of the note whose file exists on disk is
+        mapped to a (path, name) pair, with the name read from the
+        notes_db.resources table, and the actual writing is delegated to
+        exim.export_note(), whose return value is passed back.
+        '''
+        note_data = self.get_note_data(conn, field_name, item_id)
+        if note_data is None:
+            return ''
+        from .exim import export_note
+        exportable = {}
+        for resource_hash in note_data['resource_hashes']:
+            resource_path = make_long_path_useable(self.path_for_resource(resource_hash))
+            if not os.path.exists(resource_path):
+                # Resources missing on disk are left out of the export
+                continue
+            for (resource_name,) in conn.execute('SELECT name FROM notes_db.resources WHERE hash=?', (resource_hash,)):
+                exportable[resource_hash] = (resource_path, resource_name)
+        return export_note(note_data, exportable, dest_dir)
+
+    def import_note(self, conn, field_name, item_id, item_value, html_file_path):
+        '''
+        Import the HTML file at html_file_path as the note for item_id in
+        field_name.
+
+        The file is converted by exim.import_note(), which registers images
+        through self.add_resource(). The converted markup, the searchable text
+        and the set of resource hashes are then stored with set_note(), using
+        the file's st_ctime and st_mtime as the note's timestamps.
+
+        Returns the result of set_note(). Raises OSError if the file cannot be
+        stat-ed.
+        '''
+        from .exim import import_note
+
+        def add_resource(path, name):
+            return self.add_resource(conn, path, name)
+
+        # Apply the same long-path handling that is used when the file is opened for reading
+        st = os.stat(make_long_path_useable(html_file_path))
+        doc, searchable_text, resources = import_note(html_file_path, add_resource)
+        # The converted markup has to be handed to set_note(), otherwise the
+        # imported document is silently discarded.
+        # NOTE(review): assumes set_note() accepts the markup as marked_up_text -- confirm against its signature
+        return self.set_note(
+            conn, field_name, item_id, item_value, marked_up_text=doc, used_resource_hashes=resources,
+            searchable_text=searchable_text, ctime=st.st_ctime, mtime=st.st_mtime
+        )
--- a/src/calibre/db/notes/exim.py
+++ b/src/calibre/db/notes/exim.py
@ -0,0 +1,75 @@
+#!/usr/bin/env python
+# License: GPLv3 Copyright: 2023, Kovid Goyal <kovid at kovidgoyal.net>
+
+import os
+import shutil
+from html5_parser import parse
+from lxml import html
+from urllib.parse import urlparse
+
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.utils.cleantext import clean_xml_chars
+from calibre.utils.filenames import get_long_path_name, make_long_path_useable
+from calibre.utils.html2text import html2text
+
+from .connect import DOC_NAME, RESOURCE_URL_SCHEME
+
+
+def parse_html(raw):
+    '''
+    Parse raw with html5_parser and return the resulting tree.
+
+    If parsing raises, the input is passed through clean_xml_chars() and
+    parsed a second time; an error from that second attempt propagates.
+    '''
+    options = {'maybe_xhtml': False, 'sanitize_names': True}
+    try:
+        root = parse(raw, **options)
+    except Exception:
+        root = parse(clean_xml_chars(raw), **options)
+    return root
+
+
+def export_note(note_data: dict, resources: dict[str, tuple[str, str]], dest_dir: str) -> str:
+    '''
+    Write a note and its resources into dest_dir.
+
+    resources maps a resource hash to a (path, name) pair; every file is
+    copied to dest_dir under its name. In note_data['doc'], each <img> whose
+    src uses RESOURCE_URL_SCHEME and refers to a known hash is rewritten to
+    point at the copied file name. The document is saved as DOC_NAME, and its
+    access and modification times are set from note_data['ctime'] and
+    note_data['mtime'] respectively.
+
+    Returns DOC_NAME.
+    '''
+    for src_path, fname in resources.values():
+        shutil.copy2(src_path, os.path.join(dest_dir, fname))
+
+    root = parse_html(note_data['doc'])
+    for img in root.xpath('//img[@src]'):
+        try:
+            parsed = urlparse(img.get('src'))
+        except Exception:
+            continue
+        if parsed.scheme != RESOURCE_URL_SCHEME:
+            continue
+        # Resource URLs carry the hash as <host>/<path>; rebuild it as host:path
+        exported = resources.get(f'{parsed.hostname}:{parsed.path[1:]}')
+        if exported is not None:
+            img.set('src', exported[1])
+
+    serialized = html.tostring(root, encoding='utf-8')
+    doc_path = os.path.join(dest_dir, DOC_NAME)
+    with open(doc_path, 'wb') as f:
+        f.write(serialized)
+    os.utime(doc_path, times=(note_data['ctime'], note_data['mtime']))
+    return DOC_NAME
+
+
+def import_note(path_to_html_file: str, add_resource) -> tuple[str, str, set[str]]:
+    '''
+    Convert the HTML file at path_to_html_file into note markup.
+
+    Every <img> with a src has the original value saved in a pre-import-src
+    attribute. Images whose src is a plain path or a file: URL that resolves
+    to a location inside the directory containing the HTML file are
+    registered by calling add_resource(path, name), which must return a hash
+    of the form 'scheme:digest'; their src is rewritten to a
+    RESOURCE_URL_SCHEME URL built from that hash. All other images are left
+    without a src attribute.
+
+    Returns a tuple of (markup, searchable_text, resource_hashes).
+    '''
+    with open(make_long_path_useable(path_to_html_file), 'rb') as f:
+        raw = f.read()
+    shtml = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
+    basedir = os.path.dirname(os.path.abspath(path_to_html_file))
+    # The trailing separator keeps the prefix test below from matching a
+    # sibling directory whose name merely starts with the same characters
+    basedir = os.path.normcase(get_long_path_name(basedir) + os.sep)
+    root = parse_html(shtml)
+    resources = set()
+    for img in root.xpath('//img[@src]'):
+        src = img.attrib.pop('src')
+        img.set('pre-import-src', src)
+        try:
+            # Parse the saved value: the attribute itself was just removed,
+            # so reading it back from the element would yield None
+            purl = urlparse(src)
+        except Exception:
+            continue
+        if purl.scheme in ('', 'file'):
+            path = purl.path
+            if not os.path.isabs(path):
+                path = os.path.join(basedir, path)
+            q = os.path.normcase(get_long_path_name(os.path.abspath(path)))
+            # Only files located under the HTML file's directory are imported
+            if q.startswith(basedir):
+                rhash = add_resource(make_long_path_useable(path), os.path.basename(path))
+                scheme, digest = rhash.split(':', 1)
+                img.set('src', f'{RESOURCE_URL_SCHEME}://{scheme}/{digest}')
+                resources.add(rhash)
+    shtml = html.tostring(root, encoding='unicode')
+    return shtml, html2text(shtml, default_image_alt=' '), resources
--- a/src/calibre/utils/html2text.py
+++ b/src/calibre/utils/html2text.py
@ -5,7 +5,7 @@
 from calibre.utils.localization import _


-def html2text(html, single_line_break=True):
+def html2text(html, single_line_break=True, default_image_alt=''):
    from html2text import HTML2Text
    import re
    if isinstance(html, bytes):
@ -16,7 +16,7 @@ def html2text(html, single_line_break=True):
            r'<\s*(?P<solidus>/?)\s*[uU]\b(?P<rest>[^>]*)>',
            r'<\g<solidus>span\g<rest>>', html)
    h2t = HTML2Text()
-    h2t.default_image_alt = _('Unnamed image')
+    h2t.default_image_alt = default_image_alt or _('Unnamed image')
    h2t.body_width = 0
    h2t.single_line_break = single_line_break
    h2t.emphasis_mark = '*'