diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py index 28d7f1e7c7..3e5c0d24d9 100644 --- a/src/calibre/ebooks/chm/reader.py +++ b/src/calibre/ebooks/chm/reader.py @@ -100,7 +100,7 @@ class CHMReader(CHMFile): def ExtractFiles(self, output_dir=os.getcwdu(), debug_dump=False): html_files = set([]) try: - x = self.GetEncoding() + x = self.get_encoding() codecs.lookup(x) enc = x except: diff --git a/src/calibre/ebooks/conversion/plugins/chm_input.py b/src/calibre/ebooks/conversion/plugins/chm_input.py index 05f7a32aa4..a846682432 100644 --- a/src/calibre/ebooks/conversion/plugins/chm_input.py +++ b/src/calibre/ebooks/conversion/plugins/chm_input.py @@ -7,8 +7,6 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ptempfile import TemporaryDirectory -from calibre.utils.localization import get_lang -from calibre.utils.filenames import ascii_filename from calibre.constants import filesystem_encoding class CHMInput(InputFormatPlugin): @@ -57,6 +55,7 @@ class CHMInput(InputFormatPlugin): mainpath = os.path.join(tdir, mainname) metadata = get_metadata_from_reader(self._chm_reader) + encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252' self._chm_reader.CloseCHM() # print tdir, mainpath # from calibre import ipython @@ -64,15 +63,31 @@ class CHMInput(InputFormatPlugin): options.debug_pipeline = None options.input_encoding = 'utf-8' - # try a custom conversion: - #oeb = self._create_oebbook(mainpath, tdir, options, log, metadata) - # try using html converter: - htmlpath = self._create_html_root(mainpath, log) + htmlpath, toc = self._create_html_root(mainpath, log, encoding) oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata) options.debug_pipeline = odi - #log.debug('DEBUG: Not removing tempdir %s' % tdir) + if toc.count() > 1: + oeb.toc = self.parse_html_toc(oeb.spine[0]) + oeb.manifest.remove(oeb.spine[0]) + oeb.auto_generated_toc = False return oeb + def parse_html_toc(self, item): + from calibre.ebooks.oeb.base import TOC, XPath + dx = XPath('./h:div') + ax = XPath('./h:a[1]') + + def do_node(parent, div): + for child in dx(div): + a = ax(child)[0] + c = parent.add(a.text, a.attrib['href']) + do_node(c, child) + + toc = TOC() + root = XPath('//h:div[1]')(item.data)[0] + do_node(toc, root) + return toc + def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi): # use HTMLInput plugin to generate book from calibre.customize.builtins import HTMLInput @@ -81,78 +96,22 @@ class CHMInput(InputFormatPlugin): oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi) return oeb - - def _create_oebbook(self, hhcpath, basedir, opts, log, mi): - import uuid - from lxml import html - from calibre.ebooks.conversion.plumber import create_oebbook - from calibre.ebooks.oeb.base import DirContainer - oeb = create_oebbook(log, None, opts, - encoding=opts.input_encoding, populate=False) - self.oeb = oeb - - metadata = oeb.metadata - if mi.title: - metadata.add('title', mi.title) - if mi.authors: - for a in mi.authors: - metadata.add('creator', a, attrib={'role':'aut'}) - if mi.publisher: - metadata.add('publisher', mi.publisher) - if mi.isbn: - metadata.add('identifier', mi.isbn, attrib={'scheme':'ISBN'}) - if not metadata.language: - oeb.logger.warn(u'Language not specified') - metadata.add('language', get_lang().replace('_', '-')) - if not metadata.creator: - oeb.logger.warn('Creator not specified') - metadata.add('creator', _('Unknown')) - if not metadata.title: - oeb.logger.warn('Title not specified') - metadata.add('title', _('Unknown')) - - bookid = str(uuid.uuid4()) - metadata.add('identifier', bookid, id='uuid_id', scheme='uuid') - for ident in metadata.identifier: - if 'id' in ident.attrib: - self.oeb.uid = metadata.identifier[0] - break - - hhcdata = self._read_file(hhcpath) - hhcroot = html.fromstring(hhcdata) - chapters = self._process_nodes(hhcroot) - #print "=============================" - #print "Printing hhcroot" - #print etree.tostring(hhcroot, pretty_print=True) - #print "=============================" - log.debug('Found %d section nodes' % len(chapters)) - - if len(chapters) > 0: - path0 = chapters[0][1] - subpath = os.path.dirname(path0) - htmlpath = os.path.join(basedir, subpath) - - oeb.container = DirContainer(htmlpath, log) - for chapter in chapters: - title = chapter[0] - basename = os.path.basename(chapter[1]) - self._add_item(oeb, title, basename) - - oeb.container = DirContainer(htmlpath, oeb.log) - return oeb - - def _create_html_root(self, hhcpath, log): + def _create_html_root(self, hhcpath, log, encoding): from lxml import html from urllib import unquote as _unquote from calibre.ebooks.oeb.base import urlquote + from calibre.ebooks.chardet import xml_to_unicode hhcdata = self._read_file(hhcpath) + hhcdata = hhcdata.decode(encoding) + hhcdata = xml_to_unicode(hhcdata, verbose=True, + strip_encoding_pats=True, resolve_entities=True)[0] hhcroot = html.fromstring(hhcdata) - chapters = self._process_nodes(hhcroot) + toc = self._process_nodes(hhcroot) #print "=============================" #print "Printing hhcroot" #print etree.tostring(hhcroot, pretty_print=True) #print "=============================" - log.debug('Found %d section nodes' % len(chapters)) + log.debug('Found %d section nodes' % toc.count()) htmlpath = os.path.splitext(hhcpath)[0] + ".html" base = os.path.dirname(os.path.abspath(htmlpath)) @@ -168,37 +127,40 @@ class CHMInput(InputFormatPlugin): x = y return x + def donode(item, parent, base, subpath): + for child in item: + title = child.title + if not title: continue + raw = unquote_path(child.href or '') + rsrcname = os.path.basename(raw) + rsrcpath = os.path.join(subpath, rsrcname) + if (not os.path.exists(os.path.join(base, rsrcpath)) and + os.path.exists(os.path.join(base, raw))): + rsrcpath = raw + + if '%' not in rsrcpath: + rsrcpath = urlquote(rsrcpath) + if not raw: + rsrcpath = '' + c = DIV(A(title, href=rsrcpath)) + donode(child, c, base, subpath) + parent.append(c) + with open(htmlpath, 'wb') as f: - if chapters: - f.write('
\n') - path0 = chapters[0][1] + if toc.count() > 1: + from lxml.html.builder import HTML, BODY, DIV, A + path0 = toc[0].href path0 = unquote_path(path0) subpath = os.path.dirname(path0) base = os.path.dirname(f.name) - - for chapter in chapters: - title = chapter[0] - raw = unquote_path(chapter[1]) - rsrcname = os.path.basename(raw) - rsrcpath = os.path.join(subpath, rsrcname) - if (not os.path.exists(os.path.join(base, rsrcpath)) and - os.path.exists(os.path.join(base, raw))): - rsrcpath = raw - - # title should already be url encoded - if '%' not in rsrcpath: - rsrcpath = urlquote(rsrcpath) - url = "