diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py index 1a5b117f8f..e43ba27d3f 100644 --- a/src/calibre/ebooks/oeb/polish/container.py +++ b/src/calibre/ebooks/oeb/polish/container.py @@ -46,6 +46,7 @@ from calibre.ebooks.oeb.base import ( ) from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html from calibre.ebooks.oeb.polish.errors import DRMError, InvalidBook +from calibre.ebooks.oeb.polish.parsing import decode_xml from calibre.ebooks.oeb.polish.parsing import parse as parse_html_tweak from calibre.ebooks.oeb.polish.utils import OEB_FONTS, CommentFinder, PositionFinder, adjust_mime_for_epub, guess_type, insert_self_closing, parse_css from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile @@ -168,36 +169,10 @@ class ContainerBase: # {{{ :param normalize_to_nfc: Normalize returned unicode to the NFC normal form as is required by both the EPUB and AZW3 formats. ''' - def fix_data(d): - return d.replace('\r\n', '\n').replace('\r', '\n') - if isinstance(data, str): - return fix_data(data) - bom_enc = None - if data[:4] in {b'\0\0\xfe\xff', b'\xff\xfe\0\0'}: - bom_enc = {b'\0\0\xfe\xff':'utf-32-be', - b'\xff\xfe\0\0':'utf-32-le'}[data[:4]] - data = data[4:] - elif data[:2] in {b'\xff\xfe', b'\xfe\xff'}: - bom_enc = {b'\xff\xfe':'utf-16-le', b'\xfe\xff':'utf-16-be'}[data[:2]] - data = data[2:] - elif data[:3] == b'\xef\xbb\xbf': - bom_enc = 'utf-8' - data = data[3:] - if bom_enc is not None: - try: - self.used_encoding = bom_enc - return fix_data(data.decode(bom_enc)) - except UnicodeDecodeError: - pass - try: - self.used_encoding = 'utf-8' - return fix_data(data.decode('utf-8')) - except UnicodeDecodeError: - pass - data, self.used_encoding = xml_to_unicode(data) - if normalize_to_nfc: - data = unicodedata.normalize('NFC', data) - return fix_data(data) + html, used_encoding = decode_xml(data, normalize_to_nfc) + if used_encoding: + self.used_encoding = used_encoding + return html def parse_xml(self, data): data, self.used_encoding = xml_to_unicode( diff --git a/src/calibre/ebooks/oeb/polish/kepubify.py b/src/calibre/ebooks/oeb/polish/kepubify.py index 75509c9c31..311d40c996 100644 --- a/src/calibre/ebooks/oeb/polish/kepubify.py +++ b/src/calibre/ebooks/oeb/polish/kepubify.py @@ -18,6 +18,7 @@ from lxml import etree from calibre.ebooks.oeb.base import XHTML, XPath from calibre.ebooks.oeb.parse_utils import barename, merge_multiple_html_heads_and_bodies +from calibre.ebooks.oeb.polish.parsing import parse from calibre.ebooks.oeb.polish.tts import lang_for_elem from calibre.ebooks.oeb.polish.utils import extract, insert_self_closing from calibre.spell.break_iterator import sentence_positions @@ -176,11 +177,17 @@ def remove_kobo_markup_from_html(root): unwrap_body_contents(body) -def kepubify_html(root, metadata_lang='en'): +def kepubify_parsed_html(root, metadata_lang: str = 'en'): remove_kobo_markup_from_html(root) merge_multiple_html_heads_and_bodies(root) add_kobo_markup_to_html(root, metadata_lang) -def kepubify(container): +def kepubify_html_data(raw: str | bytes, metadata_lang: str = 'en'): + root = parse(raw) + kepubify_parsed_html(root, metadata_lang) + return root + + +def kepubify_container(container): lang = container.mi.language diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py index 78fead9704..c8ef145bb0 100644 --- a/src/calibre/ebooks/oeb/polish/parsing.py +++ b/src/calibre/ebooks/oeb/polish/parsing.py @@ -5,6 +5,7 @@ __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' import re +import unicodedata import html5_parser from lxml.etree import Element as LxmlElement @@ -23,6 +24,40 @@ except ImportError: XHTML_NS = 'http://www.w3.org/1999/xhtml' +def decode_xml(data: bytes, normalize_to_nfc: bool = True) -> tuple[str, str]: + used_encoding = '' + def fix_data(d): + return d.replace('\r\n', '\n').replace('\r', '\n'), used_encoding + if isinstance(data, str): + return fix_data(data) + bom_enc = None + if data[:4] in {b'\0\0\xfe\xff', b'\xff\xfe\0\0'}: + bom_enc = {b'\0\0\xfe\xff':'utf-32-be', + b'\xff\xfe\0\0':'utf-32-le'}[data[:4]] + data = data[4:] + elif data[:2] in {b'\xff\xfe', b'\xfe\xff'}: + bom_enc = {b'\xff\xfe':'utf-16-le', b'\xfe\xff':'utf-16-be'}[data[:2]] + data = data[2:] + elif data[:3] == b'\xef\xbb\xbf': + bom_enc = 'utf-8' + data = data[3:] + if bom_enc is not None: + try: + used_encoding = bom_enc + return fix_data(data.decode(bom_enc)) + except UnicodeDecodeError: + pass + try: + used_encoding = 'utf-8' + return fix_data(data.decode('utf-8')) + except UnicodeDecodeError: + pass + data, used_encoding = xml_to_unicode(data) + if normalize_to_nfc: + data = unicodedata.normalize('NFC', data) + return fix_data(data) + + def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True): if isinstance(raw, bytes): raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) diff --git a/src/calibre/ebooks/oeb/polish/tests/kepubify.py b/src/calibre/ebooks/oeb/polish/tests/kepubify.py index 29ed7ee11c..65b980ad53 100644 --- a/src/calibre/ebooks/oeb/polish/tests/kepubify.py +++ b/src/calibre/ebooks/oeb/polish/tests/kepubify.py @@ -3,8 +3,7 @@ from calibre.ebooks.oeb.base import serialize -from calibre.ebooks.oeb.polish.kepubify import kepubify_html -from calibre.ebooks.oeb.polish.parsing import parse_html5 as parse +from calibre.ebooks.oeb.polish.kepubify import kepubify_html_data from calibre.ebooks.oeb.polish.tests.base import BaseTest @@ -38,8 +37,7 @@ div#book-inner { margin-top: 0; margin-bottom: 0; }