CHM Input: Handle CHM files that contain files with URL-unsafe filenames. Fixes #1100610 (Error in "Table of contents" when convert CHM to epub)

This commit is contained in:
Kovid Goyal 2013-01-17 17:46:32 +05:30
parent b9581e5905
commit 79823fbf61

View File

@ -58,9 +58,9 @@ class CHMInput(InputFormatPlugin):
metadata = get_metadata_from_reader(self._chm_reader) metadata = get_metadata_from_reader(self._chm_reader)
self._chm_reader.CloseCHM() self._chm_reader.CloseCHM()
#print tdir # print tdir, mainpath
#from calibre import ipython # from calibre import ipython
#ipython() # ipython()
options.debug_pipeline = None options.debug_pipeline = None
options.input_encoding = 'utf-8' options.input_encoding = 'utf-8'
@ -143,6 +143,8 @@ class CHMInput(InputFormatPlugin):
def _create_html_root(self, hhcpath, log): def _create_html_root(self, hhcpath, log):
from lxml import html from lxml import html
from urllib import unquote as _unquote
from calibre.ebooks.oeb.base import urlquote
hhcdata = self._read_file(hhcpath) hhcdata = self._read_file(hhcpath)
hhcroot = html.fromstring(hhcdata) hhcroot = html.fromstring(hhcdata)
chapters = self._process_nodes(hhcroot) chapters = self._process_nodes(hhcroot)
@ -152,23 +154,41 @@ class CHMInput(InputFormatPlugin):
#print "=============================" #print "============================="
log.debug('Found %d section nodes' % len(chapters)) log.debug('Found %d section nodes' % len(chapters))
htmlpath = os.path.splitext(hhcpath)[0] + ".html" htmlpath = os.path.splitext(hhcpath)[0] + ".html"
base = os.path.dirname(os.path.abspath(htmlpath))
def unquote(x):
if isinstance(x, unicode):
x = x.encode('utf-8')
return _unquote(x).decode('utf-8')
def unquote_path(x):
y = unquote(x)
if (not os.path.exists(os.path.join(base, x)) and
os.path.exists(os.path.join(base, y))):
x = y
return x
with open(htmlpath, 'wb') as f: with open(htmlpath, 'wb') as f:
if chapters: if chapters:
f.write('<html><head><meta http-equiv="Content-type"' f.write('<html><head><meta http-equiv="Content-type"'
' content="text/html;charset=UTF-8" /></head><body>\n') ' content="text/html;charset=UTF-8" /></head><body>\n')
path0 = chapters[0][1] path0 = chapters[0][1]
path0 = unquote_path(path0)
subpath = os.path.dirname(path0) subpath = os.path.dirname(path0)
base = os.path.dirname(f.name) base = os.path.dirname(f.name)
for chapter in chapters: for chapter in chapters:
title = chapter[0] title = chapter[0]
rsrcname = os.path.basename(chapter[1]) raw = unquote_path(chapter[1])
rsrcname = os.path.basename(raw)
rsrcpath = os.path.join(subpath, rsrcname) rsrcpath = os.path.join(subpath, rsrcname)
if (not os.path.exists(os.path.join(base, rsrcpath)) and if (not os.path.exists(os.path.join(base, rsrcpath)) and
os.path.exists(os.path.join(base, chapter[1]))): os.path.exists(os.path.join(base, raw))):
rsrcpath = chapter[1] rsrcpath = raw
# title should already be url encoded # title should already be url encoded
if '%' not in rsrcpath:
rsrcpath = urlquote(rsrcpath)
url = "<br /><a href=" + rsrcpath + ">" + title + " </a>\n" url = "<br /><a href=" + rsrcpath + ">" + title + " </a>\n"
if isinstance(url, unicode): if isinstance(url, unicode):
url = url.encode('utf-8') url = url.encode('utf-8')