Conversion pipeline: Remove encoding declarations from HTML documents to guarantee that there is only a single encoding declaration in the output HTML. Fixes #773337 (html2epub convertion produces double "charset" directive in EPUB)

2025-07-09 03:04:10 -04:00 · 2011-04-29 10:32:27 -06:00 · 2011-04-29 10:32:27 -06:00 · b9098e8520
commit b9098e8520
parent 107912f63f
1 changed files with 2 additions and 1 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -16,7 +16,7 @@ from urllib import unquote as urlunquote
 from lxml import etree, html
 from calibre.constants import filesystem_encoding, __version__
 from calibre.translations.dynamic import translate
-from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
 from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
 from calibre.ebooks.conversion.preprocess import CSSPreProcessor
 from calibre import isbytestring, as_unicode, get_types_map
@@ -853,6 +853,7 @@ class Manifest(object):
            self.oeb.log.debug('Parsing', self.href, '...')
            # Convert to Unicode and normalize line endings
            data = self.oeb.decode(data)
+            data = strip_encoding_declarations(data)
            data = self.oeb.html_preprocessor(data)
            # There could be null bytes in data if it had &#0; entities in it
            data = data.replace('\0', '')