CHM Input: Support hierarchical table of contents. Do not generate an inline table of contents when a metadata table of contents is present. Also correctly decode the text in the table of contents.

2025-07-09 03:04:10 -04:00 · 2013-01-18 19:33:45 +05:30 · 2013-01-18 19:33:45 +05:30 · 1f2daebce6
commit 1f2daebce6
parent 8134629e59
3 changed files with 93 additions and 131 deletions
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@ -100,7 +100,7 @@ class CHMReader(CHMFile):
    def ExtractFiles(self, output_dir=os.getcwdu(), debug_dump=False):
        html_files = set([])
        try:
-            x = self.GetEncoding()
+            x = self.get_encoding()
            codecs.lookup(x)
            enc = x
        except:
--- a/src/calibre/ebooks/conversion/plugins/chm_input.py
+++ b/src/calibre/ebooks/conversion/plugins/chm_input.py
@ -7,8 +7,6 @@ import os
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ptempfile import TemporaryDirectory
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
 from calibre.constants import filesystem_encoding
 class CHMInput(InputFormatPlugin):
@ -57,6 +55,7 @@ class CHMInput(InputFormatPlugin):
            mainpath = os.path.join(tdir, mainname)
            metadata = get_metadata_from_reader(self._chm_reader)
            encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
            self._chm_reader.CloseCHM()
            # print tdir, mainpath
            # from calibre import ipython
@ -64,15 +63,31 @@ class CHMInput(InputFormatPlugin):
            options.debug_pipeline = None
            options.input_encoding = 'utf-8'
-            # try a custom conversion:
+            htmlpath, toc = self._create_html_root(mainpath, log, encoding)
            #oeb = self._create_oebbook(mainpath, tdir, options, log, metadata)
            # try using html converter:
            htmlpath = self._create_html_root(mainpath, log)
            oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
            options.debug_pipeline = odi
-            #log.debug('DEBUG: Not removing tempdir %s' % tdir)
+            if toc.count() > 1:
                oeb.toc = self.parse_html_toc(oeb.spine[0])
                oeb.manifest.remove(oeb.spine[0])
                oeb.auto_generated_toc = False
        return oeb
    def parse_html_toc(self, item):
        """Build a hierarchical TOC from the nested div/anchor markup in item.

        Starting from the first div found in ``item.data``, every child div
        becomes one TOC entry whose title and target are the text and
        ``href`` of that div's first anchor; divs nested inside it become
        its sub-entries.

        :param item: object whose ``data`` attribute is queried with XPath
            (presumably a parsed XHTML tree -- confirm against the caller).
        :return: the populated ``TOC`` instance.
        """
        from calibre.ebooks.oeb.base import TOC, XPath

        child_divs = XPath('./h:div')
        first_anchor = XPath('./h:a[1]')

        def populate(toc_node, container):
            # One TOC entry per direct child div, then recurse into that
            # div so the nesting of the markup is mirrored in the TOC.
            for entry in child_divs(container):
                link = first_anchor(entry)[0]
                populate(toc_node.add(link.text, link.attrib['href']), entry)

        result = TOC()
        top = XPath('//h:div[1]')(item.data)[0]
        populate(result, top)
        return result
    def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
        # use HTMLInput plugin to generate book
        from calibre.customize.builtins import HTMLInput
@ -81,78 +96,22 @@ class CHMInput(InputFormatPlugin):
        oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)
        return oeb
-
+    def _create_html_root(self, hhcpath, log, encoding):
    def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
        """Create an OEB book directly from the contents (.hhc) file.

        An unpopulated book is created, the title, authors, publisher and
        ISBN found in ``mi`` are copied into its metadata (with fallbacks
        for missing language, creator and title), a fresh UUID identifier
        is added, and every section returned by ``_process_nodes`` is
        registered through ``_add_item``.

        :param hhcpath: path of the contents file to read.
        :param basedir: directory joined with the first section's
            sub-directory to form the book container path.
        :param opts: conversion options; ``opts.input_encoding`` is passed
            on to ``create_oebbook``.
        :param log: logger used for debug output and the container.
        :param mi: metadata object providing ``title``, ``authors``,
            ``publisher`` and ``isbn``.
        :return: the created book, also stored on ``self.oeb``.
        """
        import uuid
        from lxml import html
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.base import DirContainer

        oeb = create_oebbook(log, None, opts,
                encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        metadata = oeb.metadata
        if mi.title:
            metadata.add('title', mi.title)
        if mi.authors:
            for a in mi.authors:
                metadata.add('creator', a, attrib={'role':'aut'})
        if mi.publisher:
            metadata.add('publisher', mi.publisher)
        if mi.isbn:
            metadata.add('identifier', mi.isbn, attrib={'scheme':'ISBN'})

        # Fill in the fields that must not be left empty.
        if not metadata.language:
            oeb.logger.warn(u'Language not specified')
            metadata.add('language', get_lang().replace('_', '-'))
        if not metadata.creator:
            oeb.logger.warn('Creator not specified')
            metadata.add('creator', _('Unknown'))
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', _('Unknown'))

        bookid = str(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                # Use the identifier that actually carries the id attribute.
                # identifier[0] is the id-less ISBN entry whenever mi.isbn
                # was added above, so it cannot serve as the book uid.
                self.oeb.uid = ident
                break

        hhcdata = self._read_file(hhcpath)
        hhcroot = html.fromstring(hhcdata)
        chapters = self._process_nodes(hhcroot)
        log.debug('Found %d section nodes' % len(chapters))

        if len(chapters) > 0:
            # All sections are resolved relative to the directory of the
            # first section's path.
            path0 = chapters[0][1]
            subpath = os.path.dirname(path0)
            htmlpath = os.path.join(basedir, subpath)

            oeb.container = DirContainer(htmlpath, log)
            for chapter in chapters:
                title = chapter[0]
                basename = os.path.basename(chapter[1])
                self._add_item(oeb, title, basename)

            oeb.container = DirContainer(htmlpath, oeb.log)
        return oeb
    def _create_html_root(self, hhcpath, log):
        from lxml import html
        from urllib import unquote as _unquote
        from calibre.ebooks.oeb.base import urlquote
        from calibre.ebooks.chardet import xml_to_unicode
        hhcdata = self._read_file(hhcpath)
        hhcdata = hhcdata.decode(encoding)
        hhcdata = xml_to_unicode(hhcdata, verbose=True,
                            strip_encoding_pats=True, resolve_entities=True)[0]
        hhcroot = html.fromstring(hhcdata)
-        chapters = self._process_nodes(hhcroot)
+        toc = self._process_nodes(hhcroot)
        #print "============================="
        #print "Printing hhcroot"
        #print etree.tostring(hhcroot, pretty_print=True)
        #print "============================="
-        log.debug('Found %d section nodes' % len(chapters))
+        log.debug('Found %d section nodes' % toc.count())
        htmlpath = os.path.splitext(hhcpath)[0] + ".html"
        base = os.path.dirname(os.path.abspath(htmlpath))
@ -168,37 +127,40 @@ class CHMInput(InputFormatPlugin):
                x = y
            return x
-        with open(htmlpath, 'wb') as f:
+        def donode(item, parent, base, subpath):
-            if chapters:
+            for child in item:
-                f.write('<html><head><meta http-equiv="Content-type"'
+                title = child.title
-                    ' content="text/html;charset=UTF-8" /></head><body>\n')
+                if not title: continue
-                path0 = chapters[0][1]
+                raw = unquote_path(child.href or '')
                path0 = unquote_path(path0)
                subpath = os.path.dirname(path0)
                base = os.path.dirname(f.name)
                for chapter in chapters:
                    title = chapter[0]
                    raw = unquote_path(chapter[1])
                rsrcname = os.path.basename(raw)
                rsrcpath = os.path.join(subpath, rsrcname)
                if (not os.path.exists(os.path.join(base, rsrcpath)) and
                        os.path.exists(os.path.join(base, raw))):
                    rsrcpath = raw
                    # title should already be url encoded
                if '%' not in rsrcpath:
                    rsrcpath = urlquote(rsrcpath)
-                    url = "<br /><a href=" + rsrcpath + ">" + title + " </a>\n"
+                if not raw:
-                    if isinstance(url, unicode):
+                    rsrcpath = ''
-                        url = url.encode('utf-8')
+                c = DIV(A(title, href=rsrcpath))
-                    f.write(url)
+                donode(child, c, base, subpath)
                parent.append(c)
-                f.write("</body></html>")
+        with open(htmlpath, 'wb') as f:
            if toc.count() > 1:
                from lxml.html.builder import HTML, BODY, DIV, A
                path0 = toc[0].href
                path0 = unquote_path(path0)
                subpath = os.path.dirname(path0)
                base = os.path.dirname(f.name)
                root = DIV()
                donode(toc, root, base, subpath)
                raw = html.tostring(HTML(BODY(root)), encoding='utf-8',
                                   pretty_print=True)
                f.write(raw)
            else:
                f.write(hhcdata)
-        return htmlpath
+        return htmlpath, toc
    def _read_file(self, name):
        f = open(name, 'rb')
@ -206,41 +168,27 @@ class CHMInput(InputFormatPlugin):
        f.close()
        return data
-    def _visit_node(self, node, chapters, depth):
+    def add_node(self, node, toc, ancestor_map):
        # check that node is a normal node (not a comment, DOCTYPE, etc.)
        # (normal nodes have string tags)
        if isinstance(node.tag, basestring):
        from calibre.ebooks.chm.reader import match_string
-
+        if match_string(node.attrib['type'], 'text/sitemap'):
-            chapter_path = None
+            p = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
-            if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'):
+            parent = p[0] if p else None
-                chapter_title = None
+            toc = ancestor_map.get(parent, toc)
-                for child in node:
+            title = href = u''
-                    if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'):
+            for param in node.xpath('./param'):
-                        chapter_title = child.attrib['value']
+                if match_string(param.attrib['name'], 'name'):
-                    if match_string(child.tag,'param') and match_string(child.attrib['name'],'local'):
+                    title = param.attrib['value']
-                        chapter_path = child.attrib['value']
+                elif match_string(param.attrib['name'], 'local'):
-                if chapter_title is not None and chapter_path is not None:
+                    href = param.attrib['value']
-                    chapter = [chapter_title, chapter_path, depth]
+            child = toc.add(title or _('Unknown'), href)
-                    chapters.append(chapter)
+            ancestor_map[node] = child
            if node.tag=="UL":
                depth = depth + 1
            if node.tag=="/UL":
                depth = depth - 1
    def _process_nodes(self, root):
-        chapters = []
+        from calibre.ebooks.oeb.base import TOC
-        depth = 0
+        toc = TOC()
-        for node in root.iter():
+        ancestor_map = {}
-            self._visit_node(node, chapters, depth)
+        for node in root.xpath('//object'):
-        return chapters
+            self.add_node(node, toc, ancestor_map)
        return toc
    def _add_item(self, oeb, title, path):
        """Register the file named by path as an HTML item of the book.

        The item is added to the manifest under a generated id/href (the
        href derived from the ASCII form of the file name), appended to the
        spine, and given a TOC entry titled ``title``.

        :param oeb: book whose manifest, spine and toc are updated.
        :param title: text of the TOC entry.
        :param path: path of the HTML file; only its base name is used.
        """
        basename = os.path.basename(path)
        item_id, item_href = oeb.manifest.generate(
                id='html', href=ascii_filename(basename))
        entry = oeb.manifest.add(item_id, item_href, 'text/html')
        # Keep the original (non-ASCII-folded) file name on the item.
        entry.html_input_href = basename
        oeb.spine.add(entry, True)
        oeb.toc.add(title, entry.href)
--- a/src/calibre/utils/chm/chm.py
+++ b/src/calibre/utils/chm/chm.py
@ -28,6 +28,7 @@
 import array
 import string
 import sys
 import codecs
 import calibre.utils.chm.chmlib as chmlib
 from calibre.constants import plugins
@ -184,7 +185,7 @@ locale_table = {
    0x0420 : ('iso8859_6', "Urdu", "Arabic"),
    0x0443 : ('iso8859_9', "Uzbek_Latin", "Turkish"),
    0x0843 : ('cp1251',    "Uzbek_Cyrillic", "Cyrillic"),
-    0x042a : (None,        "Vietnamese", "Vietnamese")
+    0x042a : ('cp1258',        "Vietnamese", "Vietnamese")
 }
 class CHMFile:
@ -434,6 +435,19 @@ class CHMFile:
        else:
            return None
    def get_encoding(self):
        """Return a codec name usable for this file, or None.

        ``GetEncoding()`` is tried first; when it returns None the first
        element of ``GetLCID()`` is used instead. The candidate is returned
        only if ``codecs.lookup`` accepts it, so callers can safely chain
        their own default with ``or``.

        :return: a codec name known to Python, or None when no valid
            encoding could be determined.
        """
        ans = self.GetEncoding()
        if ans is None:
            lcid = self.GetLCID()
            if lcid is not None:
                ans = lcid[0]
        if ans:
            try:
                codecs.lookup(ans)
            except Exception:
                # Unknown or malformed codec name: report "no encoding".
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                ans = None
        return ans
    def GetDWORD(self, buff, idx=0):
        '''Internal method.
        Reads a double word (4 bytes) from a buffer.