diff --git a/src/calibre/ebooks/mobi/reader/markup.py b/src/calibre/ebooks/mobi/reader/markup.py index 00c847b843..9f68b2ab04 100644 --- a/src/calibre/ebooks/mobi/reader/markup.py +++ b/src/calibre/ebooks/mobi/reader/markup.py @@ -54,12 +54,12 @@ def update_internal_links(mobi8_reader, log): # All parts are now unicode and have no internal links return parts -def remove_kindlegen_markup(parts): +def remove_kindlegen_markup(parts, aid_anchor_suffix, linked_aids): # we can safely remove all of the Kindlegen generated aid tags find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE) - within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"][^'"]*['"]''') + within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"]([^'"]*)['"]''') for i in xrange(len(parts)): part = parts[i] @@ -68,9 +68,14 @@ def remove_kindlegen_markup(parts): tag = srcpieces[j] if tag.startswith('<'): for m in within_tag_aid_position_pattern.finditer(tag): + try: + aid = m.group(1) + except IndexError: + aid = None replacement = '' - tag = within_tag_aid_position_pattern.sub(replacement, tag, - 1) + if aid in linked_aids: + replacement = ' id="%s"' % (aid + '-' + aid_anchor_suffix) + tag = within_tag_aid_position_pattern.sub(replacement, tag, 1) srcpieces[j] = tag part = "".join(srcpieces) parts[i] = part @@ -331,7 +336,7 @@ def expand_mobi8_markup(mobi8_reader, resource_map, log): parts = update_internal_links(mobi8_reader, log) # Remove pointless markup inserted by kindlegen - remove_kindlegen_markup(parts) + remove_kindlegen_markup(parts, mobi8_reader.aid_anchor_suffix, mobi8_reader.linked_aids) # Handle substitutions for the flows pieces first as they may # be inlined into the xhtml text diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index b3696a0abd..d46a036ee9 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -11,6 +11,7 @@ import struct, re, os from collections import namedtuple from itertools import repeat, izip from urlparse import urldefrag +from uuid import uuid4 from lxml import etree @@ -72,9 +73,13 @@ class Mobi8Reader(object): self.mobi6_reader, self.log = mobi6_reader, log self.header = mobi6_reader.book_header self.encrypted_fonts = [] + self.id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''') + self.name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''') + self.aid_re = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''') def __call__(self): self.mobi6_reader.check_for_drm() + self.aid_anchor_suffix = bytes(uuid4().hex) bh = self.mobi6_reader.book_header if self.mobi6_reader.kf8_type == 'joint': offset = self.mobi6_reader.kf8_boundary + 2 @@ -94,6 +99,7 @@ class Mobi8Reader(object): self.kf8_sections = self.mobi6_reader.sections[offset-1:] self.cover_offset = getattr(self.header.exth, 'cover_offset', None) + self.linked_aids = set() self.read_indices() self.build_parts() @@ -317,12 +323,14 @@ class Mobi8Reader(object): if plt == npos or pgt < plt: npos = pgt + 1 textblock = textblock[0:npos] - id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''') - name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''') for tag in reverse_tag_iter(textblock): - m = id_re.match(tag) or name_re.match(tag) + m = self.id_re.match(tag) or self.name_re.match(tag) if m is not None: return m.group(1) + m = self.aid_re.match(tag) + if m is not None: + self.linked_aids.add(m.group(1)) + return m.group(1) + b'-' + self.aid_anchor_suffix # No tag found, link to start of file return b''