mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Viewer: Spec-compliant encoding detection for EPUB
E-book viewer: When viewing EPUB files, ignore any encoding declarations in the HTML and assume that the HTML is encoded in UTF-8 unless decoding with UTF-8 fails. Fixes #1188843 [importing epub on mac mixes control characters in with typography](https://bugs.launchpad.net/calibre/+bug/1188843)
This commit is contained in:
parent
f5c9ad8661
commit
0339c6396e
@ -125,7 +125,7 @@ class EbookIterator(BookmarksMixin):
|
|||||||
[i for i in self.opf.spine if not i.is_linear]
|
[i for i in self.opf.spine if not i.is_linear]
|
||||||
self.spine = []
|
self.spine = []
|
||||||
Spiny = partial(SpineItem, read_anchor_map=read_anchor_map,
|
Spiny = partial(SpineItem, read_anchor_map=read_anchor_map,
|
||||||
run_char_count=run_char_count)
|
run_char_count=run_char_count, from_epub=self.book_format == 'EPUB')
|
||||||
is_comic = plumber.input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'}
|
is_comic = plumber.input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'}
|
||||||
for i in ordered:
|
for i in ordered:
|
||||||
spath = i.path
|
spath = i.path
|
||||||
|
@ -36,14 +36,30 @@ def anchor_map(html):
|
|||||||
class SpineItem(unicode):
|
class SpineItem(unicode):
|
||||||
|
|
||||||
def __new__(cls, path, mime_type=None, read_anchor_map=True,
|
def __new__(cls, path, mime_type=None, read_anchor_map=True,
|
||||||
run_char_count=True):
|
run_char_count=True, from_epub=False):
|
||||||
ppath = path.partition('#')[0]
|
ppath = path.partition('#')[0]
|
||||||
if not os.path.exists(path) and os.path.exists(ppath):
|
if not os.path.exists(path) and os.path.exists(ppath):
|
||||||
path = ppath
|
path = ppath
|
||||||
obj = super(SpineItem, cls).__new__(cls, path)
|
obj = super(SpineItem, cls).__new__(cls, path)
|
||||||
with open(path, 'rb') as f:
|
with open(path, 'rb') as f:
|
||||||
raw = f.read()
|
raw = f.read()
|
||||||
raw, obj.encoding = xml_to_unicode(raw)
|
if from_epub:
|
||||||
|
# According to the spec, HTML in EPUB must be encoded in utf-8 or
|
||||||
|
# utf-16. Furthermore, there exist epub files produced by the usual
|
||||||
|
# incompetents that have utf-8 encoded HTML files that contain
|
||||||
|
# incorrect encoding declarations. See
|
||||||
|
# http://www.idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#Section1.4.1.2
|
||||||
|
# http://www.idpf.org/epub/30/spec/epub30-publications.html#confreq-xml-enc
|
||||||
|
# https://bugs.launchpad.net/bugs/1188843
|
||||||
|
# So we first decode with utf-8 and only if that fails we try xml_to_unicode. This
|
||||||
|
# is the same algorithm as that used by the conversion pipeline (modulo
|
||||||
|
# some BOM based detection). Sigh.
|
||||||
|
try:
|
||||||
|
raw, obj.encoding = raw.decode('utf-8'), 'utf-8'
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
raw, obj.encoding = xml_to_unicode(raw)
|
||||||
|
else:
|
||||||
|
raw, obj.encoding = xml_to_unicode(raw)
|
||||||
obj.character_count = character_count(raw) if run_char_count else 10000
|
obj.character_count = character_count(raw) if run_char_count else 10000
|
||||||
obj.anchor_map = anchor_map(raw) if read_anchor_map else {}
|
obj.anchor_map = anchor_map(raw) if read_anchor_map else {}
|
||||||
obj.start_page = -1
|
obj.start_page = -1
|
||||||
@ -100,22 +116,24 @@ class IndexEntry(object):
|
|||||||
self.end_anchor = None
|
self.end_anchor = None
|
||||||
|
|
||||||
def create_indexing_data(spine, toc):
|
def create_indexing_data(spine, toc):
|
||||||
if not toc: return
|
if not toc:
|
||||||
|
return
|
||||||
f = partial(IndexEntry, spine)
|
f = partial(IndexEntry, spine)
|
||||||
index_entries = list(map(f,
|
index_entries = list(map(f,
|
||||||
(t for t in toc.flat() if t is not toc),
|
(t for t in toc.flat() if t is not toc),
|
||||||
(i-1 for i, t in enumerate(toc.flat()) if t is not toc)
|
(i-1 for i, t in enumerate(toc.flat()) if t is not toc)
|
||||||
))
|
))
|
||||||
index_entries.sort(key=attrgetter('sort_key'))
|
index_entries.sort(key=attrgetter('sort_key'))
|
||||||
[ i.find_end(index_entries) for i in index_entries ]
|
[i.find_end(index_entries) for i in index_entries]
|
||||||
|
|
||||||
ie = namedtuple('IndexEntry', 'entry start_anchor end_anchor')
|
ie = namedtuple('IndexEntry', 'entry start_anchor end_anchor')
|
||||||
|
|
||||||
for spine_pos, spine_item in enumerate(spine):
|
for spine_pos, spine_item in enumerate(spine):
|
||||||
for i in index_entries:
|
for i in index_entries:
|
||||||
if i.end_spine_pos < spine_pos or i.spine_pos > spine_pos:
|
if i.end_spine_pos < spine_pos or i.spine_pos > spine_pos:
|
||||||
continue # Does not touch this file
|
continue # Does not touch this file
|
||||||
start = i.anchor if i.spine_pos == spine_pos else None
|
start = i.anchor if i.spine_pos == spine_pos else None
|
||||||
end = i.end_anchor if i.spine_pos == spine_pos else None
|
end = i.end_anchor if i.spine_pos == spine_pos else None
|
||||||
spine_item.index_entries.append(ie(i, start, end))
|
spine_item.index_entries.append(ie(i, start, end))
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user