From a60a80d12526da5d43af4d94fc0af3f7a825c7f7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 12 Jan 2014 16:15:21 +0530
Subject: [PATCH] HTML Input: Fix UTF-16/32 encoded files that are linked to from the parent file not being properly processed. Fixes #1268262 [convert html file encoded as utf-16 fail to include image files and have wrong order](https://bugs.launchpad.net/calibre/+bug/1268262)

---
 src/calibre/ebooks/html/input.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 9683837ad6..df6793b107 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -20,6 +20,7 @@ from calibre.constants import iswindows
 from calibre import unicode_path, as_unicode, replace_entities
 
 class Link(object):
+
     '''
     Represents a link in a HTML file.
     '''
@@ -73,6 +74,7 @@ class IgnoreFile(Exception):
         self.errno = errno
 
 class HTMLFile(object):
+
     '''
     Contains basic information about an HTML file.
     This includes a list of links to other files as well as
@@ -103,8 +105,14 @@ class HTMLFile(object):
 
         try:
             with open(self.path, 'rb') as f:
-                src = f.read(4096)
-                self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src))
+                src = header = f.read(4096)
+                encoding = detect_xml_encoding(src)[1]
+                if encoding:
+                    try:
+                        header = header.decode(encoding)
+                    except ValueError:
+                        pass
+                self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
                 if not self.is_binary:
                     src += f.read()
         except IOError as err:
@@ -139,7 +147,6 @@ class HTMLFile(object):
     def __repr__(self):
         return str(self)
 
-
     def find_links(self, src):
         for match in self.LINK_PAT.finditer(src):
             url = None
@@ -167,7 +174,7 @@ def depth_first(root, flat, visited=set([])):
         if link.path is not None and link not in visited:
             try:
                 index = flat.index(link)
-            except ValueError: # Can happen if max_levels is used
+            except ValueError:  # Can happen if max_levels is used
                 continue
             hf = flat[index]
             if hf not in visited:
@@ -232,8 +239,7 @@ def get_filelist(htmlfile, dir, opts, log):
     log.info('Building file list...')
     filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
                         verbose=opts.verbose,
-                        encoding=opts.input_encoding)\
-                    [0 if opts.breadth_first else 1]
+                        encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
     if opts.verbose:
         log.debug('\tFound files...')
         for f in filelist: