mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
HTML Input: Fix UTF-16/32 encoded files that are linked to from the parent file not being properly processed. Fixes #1268262 [convert html file encoded as utf-16 fail to include image files and have wrong order](https://bugs.launchpad.net/calibre/+bug/1268262)
This commit is contained in:
parent
25f96bd198
commit
a60a80d125
@ -20,6 +20,7 @@ from calibre.constants import iswindows
|
||||
from calibre import unicode_path, as_unicode, replace_entities
|
||||
|
||||
class Link(object):
|
||||
|
||||
'''
|
||||
Represents a link in a HTML file.
|
||||
'''
|
||||
@ -73,6 +74,7 @@ class IgnoreFile(Exception):
|
||||
self.errno = errno
|
||||
|
||||
class HTMLFile(object):
|
||||
|
||||
'''
|
||||
Contains basic information about an HTML file. This
|
||||
includes a list of links to other files as well as
|
||||
@ -103,8 +105,14 @@ class HTMLFile(object):
|
||||
|
||||
try:
|
||||
with open(self.path, 'rb') as f:
|
||||
src = f.read(4096)
|
||||
self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src))
|
||||
src = header = f.read(4096)
|
||||
encoding = detect_xml_encoding(src)[1]
|
||||
if encoding:
|
||||
try:
|
||||
header = header.decode(encoding)
|
||||
except ValueError:
|
||||
pass
|
||||
self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
|
||||
if not self.is_binary:
|
||||
src += f.read()
|
||||
except IOError as err:
|
||||
@ -139,7 +147,6 @@ class HTMLFile(object):
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
|
||||
|
||||
def find_links(self, src):
|
||||
for match in self.LINK_PAT.finditer(src):
|
||||
url = None
|
||||
@ -232,8 +239,7 @@ def get_filelist(htmlfile, dir, opts, log):
|
||||
log.info('Building file list...')
|
||||
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
|
||||
verbose=opts.verbose,
|
||||
encoding=opts.input_encoding)\
|
||||
[0 if opts.breadth_first else 1]
|
||||
encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
|
||||
if opts.verbose:
|
||||
log.debug('\tFound files...')
|
||||
for f in filelist:
|
||||
|
Loading…
x
Reference in New Issue
Block a user