AZW3 Input: Handle kindlegen-produced AZW3 files that do not use normal HTML anchors for linking.

2025-07-09 03:04:10 -04:00 · 2015-04-28 09:53:53 +05:30 · 2015-04-28 09:53:53 +05:30 · e198d427b5
commit e198d427b5
parent c617a95672
2 changed files with 21 additions and 8 deletions
--- a/src/calibre/ebooks/mobi/reader/markup.py
+++ b/src/calibre/ebooks/mobi/reader/markup.py
@ -54,12 +54,12 @@ def update_internal_links(mobi8_reader, log):
    # All parts are now unicode and have no internal links
    return parts

-def remove_kindlegen_markup(parts):
+def remove_kindlegen_markup(parts, aid_anchor_suffix, linked_aids):

    # we can safely remove all of the Kindlegen generated aid tags
    find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''',
            re.IGNORECASE)
-    within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"][^'"]*['"]''')
+    within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"]([^'"]*)['"]''')

    for i in xrange(len(parts)):
        part = parts[i]
@ -68,9 +68,14 @@ def remove_kindlegen_markup(parts):
            tag = srcpieces[j]
            if tag.startswith('<'):
                for m in within_tag_aid_position_pattern.finditer(tag):
+                    try:
+                        aid = m.group(1)
+                    except IndexError:
+                        aid = None
                    replacement = ''
-                    tag = within_tag_aid_position_pattern.sub(replacement, tag,
-                            1)
+                    if aid in linked_aids:
+                        replacement = ' id="%s"' % (aid + '-' + aid_anchor_suffix)
+                    tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
                srcpieces[j] = tag
        part = "".join(srcpieces)
        parts[i] = part
@ -331,7 +336,7 @@ def expand_mobi8_markup(mobi8_reader, resource_map, log):
    parts = update_internal_links(mobi8_reader, log)

    # Remove pointless markup inserted by kindlegen
-    remove_kindlegen_markup(parts)
+    remove_kindlegen_markup(parts, mobi8_reader.aid_anchor_suffix, mobi8_reader.linked_aids)

    # Handle substitutions for the flows pieces first as they may
    # be inlined into the xhtml text
--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@ -11,6 +11,7 @@ import struct, re, os
 from collections import namedtuple
 from itertools import repeat, izip
 from urlparse import urldefrag
+from uuid import uuid4

 from lxml import etree

@ -72,9 +73,13 @@ class Mobi8Reader(object):
        self.mobi6_reader, self.log = mobi6_reader, log
        self.header = mobi6_reader.book_header
        self.encrypted_fonts = []
+        self.id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
+        self.name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
+        self.aid_re = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')

    def __call__(self):
        self.mobi6_reader.check_for_drm()
+        self.aid_anchor_suffix = bytes(uuid4().hex)
        bh = self.mobi6_reader.book_header
        if self.mobi6_reader.kf8_type == 'joint':
            offset = self.mobi6_reader.kf8_boundary + 2
@ -94,6 +99,7 @@ class Mobi8Reader(object):
        self.kf8_sections = self.mobi6_reader.sections[offset-1:]

        self.cover_offset = getattr(self.header.exth, 'cover_offset', None)
+        self.linked_aids = set()

        self.read_indices()
        self.build_parts()
@ -317,12 +323,14 @@ class Mobi8Reader(object):
        if plt == npos or pgt < plt:
            npos = pgt + 1
        textblock = textblock[0:npos]
-        id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
-        name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
        for tag in reverse_tag_iter(textblock):
-            m = id_re.match(tag) or name_re.match(tag)
+            m = self.id_re.match(tag) or self.name_re.match(tag)
            if m is not None:
                return m.group(1)
+            m = self.aid_re.match(tag)
+            if m is not None:
+                self.linked_aids.add(m.group(1))
+                return m.group(1) + b'-' + self.aid_anchor_suffix

        # No tag found, link to start of file
        return b''