EPUB Output: Remove invalid fragment identifiers from the NCX TOC to keep Adobe Digital Editions from bursting into big wet tears

2025-07-09 03:04:10 -04:00 · 2010-02-14 21:13:47 -07:00 · 2010-02-14 21:13:47 -07:00 · 895bd5db70
commit 895bd5db70
parent baccd08c49
2 changed files with 17 additions and 2 deletions
--- a/src/calibre/ebooks/epub/output.py
+++ b/src/calibre/ebooks/epub/output.py
@@ -258,6 +258,18 @@ class EPUBOutput(OutputFormatPlugin):
        '''
        from calibre.ebooks.oeb.base import XPath, XHTML, OEB_STYLES, barename

+        # ADE cries big wet tears when it encounters an invalid fragment
+        # identifier in the NCX toc, so require the *whole* fragment to be valid.
+        frag_pat = re.compile(r'[-A-Za-z0-9_:.]+\Z')
+        for node in self.oeb.toc.iter():
+            href = getattr(node, 'href', None)
+            if hasattr(href, 'partition'):
+                base, _, frag = href.partition('#')
+                if frag and frag_pat.match(frag) is None:
+                    self.log.warn(
+                            'Removing invalid fragment identifier %r from TOC'%frag)
+                    node.href = base
+
        for x in self.oeb.spine:
            root = x.data
            body = XPath('//h:body')(root)
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -1578,14 +1578,17 @@ class TOC(object):
            parent = etree.Element(NCX('navMap'))
        for node in self.nodes:
            id = node.id or unicode(uuid.uuid4())
-            attrib = {'id': id, 'playOrder': str(node.play_order)}
+            po = node.play_order
+            if po == 0:
+                po = 1
+            attrib = {'id': id, 'playOrder': str(po)}
            if node.klass:
                attrib['class'] = node.klass
            point = element(parent, NCX('navPoint'), attrib=attrib)
            label = etree.SubElement(point, NCX('navLabel'))
            title = node.title
            if title:
-                title = re.sub(r'\s', ' ', title)
+                title = re.sub(r'\s+', ' ', title)
            element(label, NCX('text')).text = title
            element(point, NCX('content'), src=urlunquote(node.href))
            node.to_ncx(point)