Viewer: Spec compliant encoding detection for EPUB

E-book viewer: When viewing EPUB files, ignore any encoding declarations
in the HTML and assume that the HTML is encoded in UTF-8 unless
decoding with UTF-8 fails. Fixes #1188843 [importing epub on mac mixes control characters in with typography](https://bugs.launchpad.net/calibre/+bug/1188843)
This commit is contained in:
Kovid Goyal 2013-06-08 07:45:43 +05:30
parent f5c9ad8661
commit 0339c6396e
2 changed files with 24 additions and 6 deletions

View File

@ -125,7 +125,7 @@ class EbookIterator(BookmarksMixin):
[i for i in self.opf.spine if not i.is_linear]
self.spine = []
Spiny = partial(SpineItem, read_anchor_map=read_anchor_map,
run_char_count=run_char_count)
run_char_count=run_char_count, from_epub=self.book_format == 'EPUB')
is_comic = plumber.input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'}
for i in ordered:
spath = i.path

View File

@ -36,13 +36,29 @@ def anchor_map(html):
class SpineItem(unicode):
def __new__(cls, path, mime_type=None, read_anchor_map=True,
run_char_count=True):
run_char_count=True, from_epub=False):
ppath = path.partition('#')[0]
if not os.path.exists(path) and os.path.exists(ppath):
path = ppath
obj = super(SpineItem, cls).__new__(cls, path)
with open(path, 'rb') as f:
raw = f.read()
if from_epub:
# According to the spec, HTML in EPUB must be encoded in utf-8 or
# utf-16. Furthermore, there exist epub files produced by the usual
# incompetents that have utf-8 encoded HTML files that contain
# incorrect encoding declarations. See
# http://www.idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#Section1.4.1.2
# http://www.idpf.org/epub/30/spec/epub30-publications.html#confreq-xml-enc
# https://bugs.launchpad.net/bugs/1188843
# So we first decode with utf-8 and only if that fails we try xml_to_unicode. This
# is the same algorithm as that used by the conversion pipeline (modulo
# some BOM based detection). Sigh.
try:
raw, obj.encoding = raw.decode('utf-8'), 'utf-8'
except UnicodeDecodeError:
raw, obj.encoding = xml_to_unicode(raw)
else:
raw, obj.encoding = xml_to_unicode(raw)
obj.character_count = character_count(raw) if run_char_count else 10000
obj.anchor_map = anchor_map(raw) if read_anchor_map else {}
@ -100,14 +116,15 @@ class IndexEntry(object):
self.end_anchor = None
def create_indexing_data(spine, toc):
if not toc: return
if not toc:
return
f = partial(IndexEntry, spine)
index_entries = list(map(f,
(t for t in toc.flat() if t is not toc),
(i-1 for i, t in enumerate(toc.flat()) if t is not toc)
))
index_entries.sort(key=attrgetter('sort_key'))
[ i.find_end(index_entries) for i in index_entries ]
[i.find_end(index_entries) for i in index_entries]
ie = namedtuple('IndexEntry', 'entry start_anchor end_anchor')
@ -119,3 +136,4 @@ def create_indexing_data(spine, toc):
end = i.end_anchor if i.spine_pos == spine_pos else None
spine_item.index_entries.append(ie(i, start, end))