AZW3 Input: Handle kindlegen-produced AZW3 files that do not use normal HTML anchors for linking.

This commit is contained in:
Kovid Goyal 2015-04-28 09:53:53 +05:30
parent c617a95672
commit e198d427b5
2 changed files with 21 additions and 8 deletions

View File

@ -54,12 +54,12 @@ def update_internal_links(mobi8_reader, log):
# All parts are now unicode and have no internal links
return parts
def remove_kindlegen_markup(parts):
def remove_kindlegen_markup(parts, aid_anchor_suffix, linked_aids):
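# Strip the Kindlegen generated aid attributes; an aid that is listed in
# linked_aids is rewritten as an id attribute (aid + '-' + aid_anchor_suffix)
# instead of being removed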
find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''',
re.IGNORECASE)
within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"][^'"]*['"]''')
within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"]([^'"]*)['"]''')
for i in xrange(len(parts)):
part = parts[i]
@ -68,9 +68,14 @@ def remove_kindlegen_markup(parts):
tag = srcpieces[j]
if tag.startswith('<'):
for m in within_tag_aid_position_pattern.finditer(tag):
try:
aid = m.group(1)
except IndexError:
aid = None
replacement = ''
tag = within_tag_aid_position_pattern.sub(replacement, tag,
1)
if aid in linked_aids:
replacement = ' id="%s"' % (aid + '-' + aid_anchor_suffix)
tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
part = "".join(srcpieces)
parts[i] = part
@ -331,7 +336,7 @@ def expand_mobi8_markup(mobi8_reader, resource_map, log):
parts = update_internal_links(mobi8_reader, log)
# Remove pointless markup inserted by kindlegen
remove_kindlegen_markup(parts)
remove_kindlegen_markup(parts, mobi8_reader.aid_anchor_suffix, mobi8_reader.linked_aids)
# Handle substitutions for the flows pieces first as they may
# be inlined into the xhtml text

View File

@ -11,6 +11,7 @@ import struct, re, os
from collections import namedtuple
from itertools import repeat, izip
from urlparse import urldefrag
from uuid import uuid4
from lxml import etree
@ -72,9 +73,13 @@ class Mobi8Reader(object):
self.mobi6_reader, self.log = mobi6_reader, log
self.header = mobi6_reader.book_header
self.encrypted_fonts = []
self.id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
self.name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
self.aid_re = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
def __call__(self):
self.mobi6_reader.check_for_drm()
self.aid_anchor_suffix = bytes(uuid4().hex)
bh = self.mobi6_reader.book_header
if self.mobi6_reader.kf8_type == 'joint':
offset = self.mobi6_reader.kf8_boundary + 2
@ -94,6 +99,7 @@ class Mobi8Reader(object):
self.kf8_sections = self.mobi6_reader.sections[offset-1:]
self.cover_offset = getattr(self.header.exth, 'cover_offset', None)
self.linked_aids = set()
self.read_indices()
self.build_parts()
@ -317,12 +323,14 @@ class Mobi8Reader(object):
if plt == npos or pgt < plt:
npos = pgt + 1
textblock = textblock[0:npos]
id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
for tag in reverse_tag_iter(textblock):
m = id_re.match(tag) or name_re.match(tag)
m = self.id_re.match(tag) or self.name_re.match(tag)
if m is not None:
return m.group(1)
m = self.aid_re.match(tag)
if m is not None:
self.linked_aids.add(m.group(1))
return m.group(1) + b'-' + self.aid_anchor_suffix
# No tag found, link to start of file
return b''