diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 9683837ad6..df6793b107 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -20,6 +20,7 @@ from calibre.constants import iswindows
from calibre import unicode_path, as_unicode, replace_entities
class Link(object):
+
'''
Represents a link in a HTML file.
'''
@@ -73,6 +74,7 @@ class IgnoreFile(Exception):
self.errno = errno
class HTMLFile(object):
+
'''
Contains basic information about an HTML file. This
includes a list of links to other files as well as
@@ -103,8 +105,14 @@ class HTMLFile(object):
try:
with open(self.path, 'rb') as f:
- src = f.read(4096)
- self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src))
+ src = header = f.read(4096)
+ encoding = detect_xml_encoding(src)[1]
+ if encoding:
+ try:
+ header = header.decode(encoding)
+ except ValueError:
+ pass
+ self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
if not self.is_binary:
src += f.read()
except IOError as err:
@@ -139,7 +147,6 @@ class HTMLFile(object):
def __repr__(self):
return str(self)
-
def find_links(self, src):
for match in self.LINK_PAT.finditer(src):
url = None
@@ -167,7 +174,7 @@ def depth_first(root, flat, visited=set([])):
if link.path is not None and link not in visited:
try:
index = flat.index(link)
- except ValueError: # Can happen if max_levels is used
+ except ValueError: # Can happen if max_levels is used
continue
hf = flat[index]
if hf not in visited:
@@ -232,8 +239,7 @@ def get_filelist(htmlfile, dir, opts, log):
log.info('Building file list...')
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
verbose=opts.verbose,
- encoding=opts.input_encoding)\
- [0 if opts.breadth_first else 1]
+ encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
if opts.verbose:
log.debug('\tFound files...')
for f in filelist: