diff --git a/src/calibre/db/backend.py b/src/calibre/db/backend.py
index bb15f97407..37bc27f7cf 100644
--- a/src/calibre/db/backend.py
+++ b/src/calibre/db/backend.py
@@ -1029,6 +1029,15 @@ class DB:
     def restore_notes(self, report_progress):
         self.notes.restore(self.conn, self.tables, report_progress)
 
+    def import_note(self, field, item_id, html_file_path):
+        ''' Import the note for item_id in field from the HTML file at html_file_path '''
+        id_val = self.tables[field].id_map[item_id]
+        return self.notes.import_note(self.conn, field, item_id, id_val, html_file_path)
+
+    def export_note(self, field, item_id, dest_dir):
+        ''' Export the note for item_id in field into dest_dir '''
+        return self.notes.export_note(self.conn, field, item_id, dest_dir)
+
     def initialize_fts(self, dbref):
         self.fts = None
         if not self.prefs['fts_enabled']:
diff --git a/src/calibre/db/cache.py b/src/calibre/db/cache.py
index d9733c75ff..159a0ff38c 100644
--- a/src/calibre/db/cache.py
+++ b/src/calibre/db/cache.py
@@ -705,6 +705,16 @@ class Cache:
     def unretire_note_for(self, field, item_id) -> int:
         return self.backend.unretire_note_for(field, item_id)
 
+    @read_api
+    def export_note(self, field, item_id, dest_dir):
+        ''' Export the note associated with item_id in field into dest_dir '''
+        return self.backend.export_note(field, item_id, dest_dir)
+
+    @write_api
+    def import_note(self, field, item_id, path_to_html_file):
+        ''' Set the note associated with item_id in field from the HTML file at path_to_html_file '''
+        return self.backend.import_note(field, item_id, path_to_html_file)
+
     @write_api  # we need to use write locking as SQLITE gives a locked table error if multiple FTS queries are made at the same time
     def notes_search(
         self,
diff --git a/src/calibre/db/notes/connect.py b/src/calibre/db/notes/connect.py
index 2c6df281c5..3f3820b87f 100644
--- a/src/calibre/db/notes/connect.py
+++ b/src/calibre/db/notes/connect.py
@@ -466,3 +466,43 @@ class Notes:
                     errors.append(_('Could not restore item: {} as not present in database').format(f'{field}/{item_id}'))
             report_progress('', i)
         return errors
+
+    def export_note(self, conn, field_name, item_id, dest_dir):
+        '''
+        Export the note for item_id in field_name, together with the resource
+        files it uses, into dest_dir. Returns '' when there is no note data,
+        otherwise the value returned by exim.export_note().
+        '''
+        nd = self.get_note_data(conn, field_name, item_id)
+        if nd is None:
+            return ''
+        from .exim import export_note
+        resources = {}
+        for rh in nd['resource_hashes']:
+            p = make_long_path_useable(self.path_for_resource(rh))
+            # a resource whose file is missing on disk is left out of the export
+            if os.path.exists(p):
+                for (name,) in conn.execute('SELECT name FROM notes_db.resources WHERE hash=?', (rh,)):
+                    resources[rh] = (p, name)
+        return export_note(nd, resources, dest_dir)
+
+    def import_note(self, conn, field_name, item_id, item_value, html_file_path):
+        '''
+        Set the note for item_id in field_name from the HTML file at
+        html_file_path, using that file's st_ctime and st_mtime as the note
+        timestamps. Returns the result of set_note().
+        '''
+        from .exim import import_note
+
+        def add_resource(path, name):
+            return self.add_resource(conn, path, name)
+
+        st = os.stat(html_file_path)
+        doc, searchable_text, resources = import_note(html_file_path, add_resource)
+        # doc (the rewritten markup) has to be handed to set_note() as well.
+        # NOTE(review): assumes the parameter after item_value is the note
+        # markup -- confirm against set_note()'s signature.
+        return self.set_note(
+            conn, field_name, item_id, item_value, doc, used_resource_hashes=resources, searchable_text=searchable_text,
+            ctime=st.st_ctime, mtime=st.st_mtime
+        )
diff --git a/src/calibre/db/notes/exim.py b/src/calibre/db/notes/exim.py
new file mode 100644
index 0000000000..f912eb548f
--- /dev/null
+++ b/src/calibre/db/notes/exim.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+# License: GPLv3 Copyright: 2023, Kovid Goyal
+
+import os
+import shutil
+from html5_parser import parse
+from lxml import html
+from urllib.parse import urlparse
+
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.utils.cleantext import clean_xml_chars
+from calibre.utils.filenames import get_long_path_name, make_long_path_useable
+from calibre.utils.html2text import html2text
+
+from .connect import DOC_NAME, RESOURCE_URL_SCHEME
+
+
+def parse_html(raw):
+    ''' Parse raw as HTML, retrying with clean_xml_chars() applied if the first attempt raises '''
+    try:
+        return parse(raw, maybe_xhtml=False, sanitize_names=True)
+    except Exception:
+        return parse(clean_xml_chars(raw), maybe_xhtml=False, sanitize_names=True)
+
+
+def export_note(note_data: dict, resources: dict[str, tuple[str, str]], dest_dir: str) -> str:
+    '''
+    Copy every resource file into dest_dir and write the note markup there as
+    DOC_NAME, after pointing each img src that uses RESOURCE_URL_SCHEME at
+    the copied file's name. resources maps resource hash -> (path, name).
+    Returns DOC_NAME.
+    '''
+    for path, name in resources.values():
+        shutil.copy2(path, os.path.join(dest_dir, name))
+    root = parse_html(note_data['doc'])
+    for img in root.xpath('//img[@src]'):
+        try:
+            purl = urlparse(img.get('src'))
+        except Exception:
+            continue
+        if purl.scheme == RESOURCE_URL_SCHEME:
+            rhash = f'{purl.hostname}:{purl.path[1:]}'
+            x = resources.get(rhash)
+            if x is not None:
+                img.set('src', x[1])
+
+    shtml = html.tostring(root, encoding='utf-8')
+    with open(os.path.join(dest_dir, DOC_NAME), 'wb') as f:
+        f.write(shtml)
+    # done after the file is closed so the final flush cannot reset mtime;
+    # times is (atime, mtime), so the note's ctime lands in the access time
+    os.utime(f.name, times=(note_data['ctime'], note_data['mtime']))
+    return DOC_NAME
+
+
+def import_note(path_to_html_file: str, add_resource) -> tuple[str, str, set[str]]:
+    '''
+    Read the HTML file at path_to_html_file and, for every img whose src is
+    a local file inside that file's directory, call add_resource(path, name)
+    and point src at the resulting RESOURCE_URL_SCHEME URL. The original src
+    is always moved to the pre-import-src attribute, so images that are not
+    imported end up without a src.
+
+    Returns (markup, searchable text, set of resource hashes).
+    '''
+    with open(make_long_path_useable(path_to_html_file), 'rb') as f:
+        raw = f.read()
+    shtml = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
+    basedir = os.path.dirname(os.path.abspath(path_to_html_file))
+    basedir = os.path.normcase(get_long_path_name(basedir) + os.sep)
+    root = parse_html(shtml)
+    resources = set()
+    for img in root.xpath('//img[@src]'):
+        src = img.attrib.pop('src')
+        img.set('pre-import-src', src)
+        try:
+            # src was just popped off the element, so parse the saved value
+            purl = urlparse(src)
+        except Exception:
+            continue
+        if purl.scheme in ('', 'file'):
+            path = purl.path
+            if not os.path.isabs(path):
+                path = os.path.join(basedir, path)
+            q = os.path.normcase(get_long_path_name(os.path.abspath(path)))
+            # only files inside the HTML file's own directory tree are imported
+            if q.startswith(basedir):
+                rhash = add_resource(make_long_path_useable(path), os.path.basename(path))
+                scheme, digest = rhash.split(':', 1)
+                img.set('src', f'{RESOURCE_URL_SCHEME}://{scheme}/{digest}')
+                resources.add(rhash)
+    shtml = html.tostring(root, encoding='unicode')
+    return shtml, html2text(shtml, default_image_alt=' '), resources
diff --git a/src/calibre/utils/html2text.py b/src/calibre/utils/html2text.py
index 4ce9e0b1ec..4dc0643864 100644
--- a/src/calibre/utils/html2text.py
+++ b/src/calibre/utils/html2text.py
@@ -5,7 +5,7 @@
 from calibre.utils.localization import _
 
 
-def html2text(html, single_line_break=True):
+def html2text(html, single_line_break=True, default_image_alt=''):
     from html2text import HTML2Text
     import re
     if isinstance(html, bytes):
@@ -16,7 +16,7 @@ def html2text(html, single_line_break=True):
             r'<\s*(?P<solidus>/?)\s*[uU]\b(?P<rest>[^>]*)>',
             r'<\g<solidus>span\g<rest>>', html)
     h2t = HTML2Text()
-    h2t.default_image_alt = _('Unnamed image')
+    h2t.default_image_alt = default_image_alt or _('Unnamed image')
     h2t.body_width = 0
     h2t.single_line_break = single_line_break
     h2t.emphasis_mark = '*'