MOBI Input: Handle MOBI files that specify anchor points exactly at pagebreaks. These are apparently produced by John Wiley and Sons. Fixes #914036 (MOBI to EPUB conversion bad TOC item pagebreaks)

2025-07-09 03:04:10 -04:00 · 2012-01-12 12:27:15 +05:30 · 2012-01-12 12:27:15 +05:30 · 5d664cf830
commit 5d664cf830
parent 2d9e47db52
1 changed files with 40 additions and 4 deletions
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@ -244,7 +244,9 @@ class MetadataHeader(BookHeader):


 class MobiReader(object):
-    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
+    PAGE_BREAK_PAT = re.compile(
+        r'<\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
+        re.IGNORECASE)
    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')

    def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
@ -539,6 +541,9 @@ class MobiReader(object):
            x.getparent().remove(x)
        svg_tags = []
        forwardable_anchors = []
+        pagebreak_anchors = []
+        BLOCK_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+                                'div', 'p'}
        for i, tag in enumerate(root.iter(etree.Element)):
            tag.attrib.pop('xmlns', '')
            for x in tag.attrib:
@ -657,6 +662,10 @@ class MobiReader(object):
                if not tag.text:
                    tag.tag = 'div'

+            if (attrib.get('class', None) == 'mbp_pagebreak' and tag.tag ==
+                    'div' and 'filepos-id' in attrib):
+                pagebreak_anchors.append(tag)
+
            if 'filepos-id' in attrib:
                attrib['id'] = attrib.pop('filepos-id')
                if 'name' in attrib and attrib['name'] != attrib['id']:
@ -670,8 +679,7 @@ class MobiReader(object):
            if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos')
                    and not tag.text and (tag.tail is None or not
                        tag.tail.strip()) and getattr(tag.getnext(), 'tag',
-                            None) in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-                                'div', 'p')):
+                            None) in BLOCK_TAGS):
                # This is an empty anchor immediately before a block tag, move
                # the id onto the block tag instead
                forwardable_anchors.append(tag)
@ -704,6 +712,18 @@ class MobiReader(object):
            if hasattr(parent, 'remove'):
                parent.remove(tag)

+        for tag in pagebreak_anchors:
+            anchor = tag.attrib['id']
+            del tag.attrib['id']
+            if 'name' in tag.attrib:
+                del tag.attrib['name']
+            p = tag.getparent()
+            a = p.makeelement('a')
+            a.attrib['id'] = anchor
+            p.insert(p.index(tag)+1, a)
+            if getattr(a.getnext(), 'tag', None) in BLOCK_TAGS:
+                forwardable_anchors.append(a)
+
        for tag in forwardable_anchors:
            block = tag.getnext()
            tag.getparent().remove(tag)
@ -919,7 +939,7 @@ class MobiReader(object):

    def replace_page_breaks(self):
        self.processed_html = self.PAGE_BREAK_PAT.sub(
-            '<div class="mbp_pagebreak" />',
+            r'<div \1 class="mbp_pagebreak" />',
            self.processed_html)

    def add_anchors(self):
@ -1047,3 +1067,19 @@ def get_metadata(stream):
        im.convert('RGB').save(obuf, format='JPEG')
        mi.cover_data = ('jpg', obuf.getvalue())
    return mi
+
+def test_mbp_regex():
+    for raw, m in {
+        '<mbp:pagebreak></mbp:pagebreak>':'',
+        '<mbp:pagebreak xxx></mbp:pagebreak>yyy':' xxxyyy',
+        '<mbp:pagebreak> </mbp:pagebreak>':'',
+        '<mbp:pagebreak>xxx':'xxx',
+        '<mbp:pagebreak/>xxx':'xxx',
+        '<mbp:pagebreak sdf/ >xxx':' sdfxxx',
+        '<mbp:pagebreak / >':' ',
+        }.iteritems():
+        ans = MobiReader.PAGE_BREAK_PAT.sub(r'\1', raw)
+        if ans != m:
+            raise Exception('%r != %r for %r'%(ans, m, raw))
+
+