From a60a80d12526da5d43af4d94fc0af3f7a825c7f7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 12 Jan 2014 16:15:21 +0530
Subject: [PATCH] HTML Input: Fix UTF-16/32 encoded files that are linked to
 from the parent file not being properly processed. Fixes #1268262 [convert
 html file encoded as utf-16 fail to include image files and have wrong
 order](https://bugs.launchpad.net/calibre/+bug/1268262)

---
 src/calibre/ebooks/html/input.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 9683837ad6..df6793b107 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -20,6 +20,7 @@ from calibre.constants import iswindows
 from calibre import unicode_path, as_unicode, replace_entities
 
 class Link(object):
+
     '''
     Represents a link in a HTML file.
     '''
@@ -73,6 +74,7 @@ class IgnoreFile(Exception):
         self.errno = errno
 
 class HTMLFile(object):
+
     '''
     Contains basic information about an HTML file. This
     includes a list of links to other files as well as
@@ -103,8 +105,14 @@ class HTMLFile(object):
 
         try:
             with open(self.path, 'rb') as f:
-                src = f.read(4096)
-                self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src))
+                src = header = f.read(4096)
+                encoding = detect_xml_encoding(src)[1]
+                if encoding:
+                    try:
+                        header = header.decode(encoding)
+                    except ValueError:
+                        pass
+                self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
                 if not self.is_binary:
                     src += f.read()
         except IOError as err:
@@ -139,7 +147,6 @@ class HTMLFile(object):
     def __repr__(self):
         return str(self)
 
-
     def find_links(self, src):
         for match in self.LINK_PAT.finditer(src):
             url = None
@@ -167,7 +174,7 @@ def depth_first(root, flat, visited=set([])):
         if link.path is not None and link not in visited:
             try:
                 index = flat.index(link)
-            except ValueError: # Can happen if max_levels is used
+            except ValueError:  # Can happen if max_levels is used
                 continue
             hf = flat[index]
             if hf not in visited:
@@ -232,8 +239,7 @@ def get_filelist(htmlfile, dir, opts, log):
     log.info('Building file list...')
     filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
                         verbose=opts.verbose,
-                        encoding=opts.input_encoding)\
-                [0 if opts.breadth_first else 1]
+                        encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
     if opts.verbose:
         log.debug('\tFound files...')
         for f in filelist: