mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
HTML Input: Fix UTF-16/32 encoded files that are linked to from the parent file not being properly processed. Fixes #1268262 [convert html file encoded as utf-16 fail to include image files and have wrong order](https://bugs.launchpad.net/calibre/+bug/1268262)
This commit is contained in:
parent
25f96bd198
commit
a60a80d125
@ -20,6 +20,7 @@ from calibre.constants import iswindows
|
|||||||
from calibre import unicode_path, as_unicode, replace_entities
|
from calibre import unicode_path, as_unicode, replace_entities
|
||||||
|
|
||||||
class Link(object):
|
class Link(object):
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Represents a link in a HTML file.
|
Represents a link in a HTML file.
|
||||||
'''
|
'''
|
||||||
@ -73,6 +74,7 @@ class IgnoreFile(Exception):
|
|||||||
self.errno = errno
|
self.errno = errno
|
||||||
|
|
||||||
class HTMLFile(object):
|
class HTMLFile(object):
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Contains basic information about an HTML file. This
|
Contains basic information about an HTML file. This
|
||||||
includes a list of links to other files as well as
|
includes a list of links to other files as well as
|
||||||
@ -103,8 +105,14 @@ class HTMLFile(object):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
with open(self.path, 'rb') as f:
|
with open(self.path, 'rb') as f:
|
||||||
src = f.read(4096)
|
src = header = f.read(4096)
|
||||||
self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src))
|
encoding = detect_xml_encoding(src)[1]
|
||||||
|
if encoding:
|
||||||
|
try:
|
||||||
|
header = header.decode(encoding)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
|
||||||
if not self.is_binary:
|
if not self.is_binary:
|
||||||
src += f.read()
|
src += f.read()
|
||||||
except IOError as err:
|
except IOError as err:
|
||||||
@ -139,7 +147,6 @@ class HTMLFile(object):
|
|||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return str(self)
|
return str(self)
|
||||||
|
|
||||||
|
|
||||||
def find_links(self, src):
|
def find_links(self, src):
|
||||||
for match in self.LINK_PAT.finditer(src):
|
for match in self.LINK_PAT.finditer(src):
|
||||||
url = None
|
url = None
|
||||||
@ -232,8 +239,7 @@ def get_filelist(htmlfile, dir, opts, log):
|
|||||||
log.info('Building file list...')
|
log.info('Building file list...')
|
||||||
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
|
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
|
||||||
verbose=opts.verbose,
|
verbose=opts.verbose,
|
||||||
encoding=opts.input_encoding)\
|
encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
|
||||||
[0 if opts.breadth_first else 1]
|
|
||||||
if opts.verbose:
|
if opts.verbose:
|
||||||
log.debug('\tFound files...')
|
log.debug('\tFound files...')
|
||||||
for f in filelist:
|
for f in filelist:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user