From c84aa8105b4e28318a36bfb02f7a8529e50a42b8 Mon Sep 17 00:00:00 2001 From: James Ralston <> Date: Tue, 9 Feb 2010 00:04:36 -0800 Subject: [PATCH] Experiment with using html input plugin to process chm --- src/calibre/ebooks/chm/input.py | 55 ++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py index 8bb6f03aa7..c9116bcb5f 100644 --- a/src/calibre/ebooks/chm/input.py +++ b/src/calibre/ebooks/chm/input.py @@ -244,12 +244,24 @@ class CHMInput(InputFormatPlugin): odi = options.debug_pipeline options.debug_pipeline = None # try a custom conversion: - oeb = self._create_oebbook(mainpath, tdir, options, log, metadata) + #oeb = self._create_oebbook(mainpath, tdir, options, log, metadata) + # try using html converter: + htmlpath = self._create_html_root(mainpath, log) + oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata) options.debug_pipeline = odi #log.debug('DEBUG: Not removing tempdir %s' % tdir) shutil.rmtree(tdir) return oeb + def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi): + # use HTMLInput plugin to generate book + from calibre.ebooks.html.input import HTMLInput + opts.breadth_first = True + htmlinput = HTMLInput(None) + oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi) + return oeb + + def _create_oebbook(self, hhcpath, basedir, opts, log, mi): from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.oeb.base import DirContainer, \ @@ -311,13 +323,43 @@ class CHMInput(InputFormatPlugin): oeb.container = DirContainer(htmlpath, oeb.log) return oeb + def _create_html_root(self, hhcpath, log): + hhcdata = self._read_file(hhcpath) + hhcroot = html.fromstring(hhcdata) + chapters = self._process_nodes(hhcroot) + #print "=============================" + #print "Printing hhcroot" + #print etree.tostring(hhcroot, pretty_print=True) + #print "=============================" + log.debug('Found %d section nodes' % len(chapters)) + htmlpath = os.path.splitext(hhcpath)[0] + ".html" + f = open(htmlpath, 'wb') + f.write("
\r\n") + + if chapters: + path0 = chapters[0][1] + subpath = os.path.dirname(path0) + + for chapter in chapters: + title = chapter[0] + rsrcname = os.path.basename(chapter[1]) + rsrcpath = os.path.join(subpath, rsrcname) + # title should already be url encoded + url = "