Viewer: Spec-compliant encoding detection for EPUB

E-book viewer: When viewing EPUB files, ignore any encoding declarations in the HTML and assume that the HTML is encoded in UTF-8 unless decoding with UTF-8 fails. Fixes #1188843 [importing epub on mac mixes control characters in with typography](https://bugs.launchpad.net/calibre/+bug/1188843)
2025-07-09 03:04:10 -04:00 · 2013-06-08 07:45:43 +05:30 · 2013-06-08 07:45:43 +05:30 · 0339c6396e
commit 0339c6396e
parent f5c9ad8661
2 changed files with 24 additions and 6 deletions
--- a/src/calibre/ebooks/oeb/iterator/book.py
+++ b/src/calibre/ebooks/oeb/iterator/book.py
@@ -125,7 +125,7 @@ class EbookIterator(BookmarksMixin):
                  [i for i in self.opf.spine if not i.is_linear]
        self.spine = []
        Spiny = partial(SpineItem, read_anchor_map=read_anchor_map,
-                run_char_count=run_char_count)
+                run_char_count=run_char_count, from_epub=self.book_format == 'EPUB')
        is_comic = plumber.input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'}
        for i in ordered:
            spath = i.path
--- a/src/calibre/ebooks/oeb/iterator/spine.py
+++ b/src/calibre/ebooks/oeb/iterator/spine.py
@@ -36,13 +36,29 @@ def anchor_map(html):
 class SpineItem(unicode):

    def __new__(cls, path, mime_type=None, read_anchor_map=True,
-            run_char_count=True):
+            run_char_count=True, from_epub=False):
        ppath = path.partition('#')[0]
        if not os.path.exists(path) and os.path.exists(ppath):
            path = ppath
        obj = super(SpineItem, cls).__new__(cls, path)
        with open(path, 'rb') as f:
            raw = f.read()
+        if from_epub:
+            # According to the spec, HTML in EPUB must be encoded in utf-8 or
+            # utf-16. Furthermore, there exist epub files produced by the usual
+            # incompetents that have utf-8 encoded HTML files that contain
+            # incorrect encoding declarations. See
+            # http://www.idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#Section1.4.1.2
+            # http://www.idpf.org/epub/30/spec/epub30-publications.html#confreq-xml-enc
+            # https://bugs.launchpad.net/bugs/1188843
+            # So we first decode with utf-8 and only if that fails we try xml_to_unicode. This
+            # is the same algorithm as that used by the conversion pipeline (modulo
+            # some BOM based detection). Sigh.
+            try:
+                raw, obj.encoding = raw.decode('utf-8'), 'utf-8'
+            except UnicodeDecodeError:
+                raw, obj.encoding = xml_to_unicode(raw)
+        else:
            raw, obj.encoding = xml_to_unicode(raw)
        obj.character_count = character_count(raw) if run_char_count else 10000
        obj.anchor_map = anchor_map(raw) if read_anchor_map else {}
@@ -100,14 +116,15 @@ class IndexEntry(object):
            self.end_anchor = None

 def create_indexing_data(spine, toc):
-    if not toc: return
+    if not toc:
+        return
    f = partial(IndexEntry, spine)
    index_entries = list(map(f,
        (t for t in toc.flat() if t is not toc),
        (i-1 for i, t in enumerate(toc.flat()) if t is not toc)
        ))
    index_entries.sort(key=attrgetter('sort_key'))
-    [ i.find_end(index_entries) for i in index_entries ]
+    [i.find_end(index_entries) for i in index_entries]

    ie = namedtuple('IndexEntry', 'entry start_anchor end_anchor')

@@ -119,3 +136,4 @@ def create_indexing_data(spine, toc):
            end = i.end_anchor if i.spine_pos == spine_pos else None
            spine_item.index_entries.append(ie(i, start, end))

+