HTML Input: Fix UTF-16/32 encoded files that are linked to from the parent file not being properly processed. Fixes #1268262 [convert html file encoded as utf-16 fail to include image files and have wrong order](https://bugs.launchpad.net/calibre/+bug/1268262)

This commit is contained in:
Kovid Goyal 2014-01-12 16:15:21 +05:30
parent 25f96bd198
commit a60a80d125

View File

@@ -20,6 +20,7 @@ from calibre.constants import iswindows
from calibre import unicode_path, as_unicode, replace_entities
class Link(object):
'''
Represents a link in a HTML file.
'''
@@ -73,6 +74,7 @@ class IgnoreFile(Exception):
self.errno = errno
class HTMLFile(object):
'''
Contains basic information about an HTML file. This
includes a list of links to other files as well as
@@ -103,8 +105,14 @@ class HTMLFile(object):
try:
with open(self.path, 'rb') as f:
src = f.read(4096)
self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src))
src = header = f.read(4096)
encoding = detect_xml_encoding(src)[1]
if encoding:
try:
header = header.decode(encoding)
except ValueError:
pass
self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
if not self.is_binary:
src += f.read()
except IOError as err:
@@ -139,7 +147,6 @@ class HTMLFile(object):
def __repr__(self):
return str(self)
def find_links(self, src):
for match in self.LINK_PAT.finditer(src):
url = None
@@ -167,7 +174,7 @@ def depth_first(root, flat, visited=set([])):
if link.path is not None and link not in visited:
try:
index = flat.index(link)
except ValueError: # Can happen if max_levels is used
except ValueError: # Can happen if max_levels is used
continue
hf = flat[index]
if hf not in visited:
@@ -232,8 +239,7 @@ def get_filelist(htmlfile, dir, opts, log):
log.info('Building file list...')
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
verbose=opts.verbose,
encoding=opts.input_encoding)\
[0 if opts.breadth_first else 1]
encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
if opts.verbose:
log.debug('\tFound files...')
for f in filelist: