mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
CHM Input: Fix handling of relative file paths in <img> tags. Fixes #7159 (Large CHM file fails conversion)
This commit is contained in:
parent
ca4953f028
commit
30b3d2a564
@@ -93,6 +93,7 @@ class CHMReader(CHMFile):
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
def ExtractFiles(self, output_dir=os.getcwdu()):
|
def ExtractFiles(self, output_dir=os.getcwdu()):
|
||||||
|
html_files = set([])
|
||||||
for path in self.Contents():
|
for path in self.Contents():
|
||||||
lpath = os.path.join(output_dir, path)
|
lpath = os.path.join(output_dir, path)
|
||||||
self._ensure_dir(lpath)
|
self._ensure_dir(lpath)
|
||||||
@@ -106,14 +107,27 @@ class CHMReader(CHMFile):
|
|||||||
lpath = lpath.split(';')[0]
|
lpath = lpath.split(';')[0]
|
||||||
try:
|
try:
|
||||||
with open(lpath, 'wb') as f:
|
with open(lpath, 'wb') as f:
|
||||||
if guess_mimetype(path)[0] == ('text/html'):
|
|
||||||
data = self._reformat(data)
|
|
||||||
f.write(data)
|
f.write(data)
|
||||||
|
try:
|
||||||
|
if 'html' in guess_mimetype(path)[0]:
|
||||||
|
html_files.add(lpath)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
except:
|
except:
|
||||||
if iswindows and len(lpath) > 250:
|
if iswindows and len(lpath) > 250:
|
||||||
self.log.warn('%r filename too long, skipping'%path)
|
self.log.warn('%r filename too long, skipping'%path)
|
||||||
continue
|
continue
|
||||||
raise
|
raise
|
||||||
|
for lpath in html_files:
|
||||||
|
with open(lpath, 'r+b') as f:
|
||||||
|
data = f.read()
|
||||||
|
data = self._reformat(data, lpath)
|
||||||
|
if isinstance(data, unicode):
|
||||||
|
data = data.encode('utf-8')
|
||||||
|
f.seek(0)
|
||||||
|
f.truncate()
|
||||||
|
f.write(data)
|
||||||
|
|
||||||
self._extracted = True
|
self._extracted = True
|
||||||
files = [x for x in os.listdir(output_dir) if
|
files = [x for x in os.listdir(output_dir) if
|
||||||
os.path.isfile(os.path.join(output_dir, x))]
|
os.path.isfile(os.path.join(output_dir, x))]
|
||||||
@@ -125,7 +139,7 @@ class CHMReader(CHMFile):
|
|||||||
if self.hhc_path not in files and files:
|
if self.hhc_path not in files and files:
|
||||||
self.hhc_path = files[0]
|
self.hhc_path = files[0]
|
||||||
|
|
||||||
def _reformat(self, data):
|
def _reformat(self, data, htmlpath):
|
||||||
try:
|
try:
|
||||||
data = xml_to_unicode(data, strip_encoding_pats=True)[0]
|
data = xml_to_unicode(data, strip_encoding_pats=True)[0]
|
||||||
soup = BeautifulSoup(data)
|
soup = BeautifulSoup(data)
|
||||||
@@ -169,15 +183,19 @@ class CHMReader(CHMFile):
|
|||||||
br[0].extract()
|
br[0].extract()
|
||||||
|
|
||||||
# some images seem to be broken in some chm's :/
|
# some images seem to be broken in some chm's :/
|
||||||
for img in soup('img'):
|
base = os.path.dirname(htmlpath)
|
||||||
try:
|
for img in soup('img', src=True):
|
||||||
# some are supposedly "relative"... lies.
|
src = img['src']
|
||||||
while img['src'].startswith('../'): img['src'] = img['src'][3:]
|
ipath = os.path.join(base, *src.split('/'))
|
||||||
# some have ";<junk>" at the end.
|
if os.path.exists(ipath):
|
||||||
img['src'] = img['src'].split(';')[0]
|
continue
|
||||||
except KeyError:
|
src = src.split(';')[0]
|
||||||
# and some don't even have a src= ?!
|
if not src: continue
|
||||||
pass
|
ipath = os.path.join(base, *src.split('/'))
|
||||||
|
if not os.path.exists(ipath):
|
||||||
|
while src.startswith('../'):
|
||||||
|
src = src[3:]
|
||||||
|
img['src'] = src
|
||||||
try:
|
try:
|
||||||
# if there is only a single table with a single element
|
# if there is only a single table with a single element
|
||||||
# in the body, replace it by the contents of this single element
|
# in the body, replace it by the contents of this single element
|
||||||
|
Loading…
x
Reference in New Issue
Block a user