AZW3 Input: Handle kindlegen-produced AZW3 files that do not use normal HTML anchors for linking.

This commit is contained in:
Kovid Goyal 2015-04-28 09:53:53 +05:30
parent c617a95672
commit e198d427b5
2 changed files with 21 additions and 8 deletions

View File

@ -54,12 +54,12 @@ def update_internal_links(mobi8_reader, log):
# All parts are now unicode and have no internal links # All parts are now unicode and have no internal links
return parts return parts
def remove_kindlegen_markup(parts): def remove_kindlegen_markup(parts, aid_anchor_suffix, linked_aids):
# we can safely remove all of the Kindlegen generated aid tags # we can safely remove all of the Kindlegen generated aid tags
find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''', find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''',
re.IGNORECASE) re.IGNORECASE)
within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"][^'"]*['"]''') within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"]([^'"]*)['"]''')
for i in xrange(len(parts)): for i in xrange(len(parts)):
part = parts[i] part = parts[i]
@ -68,9 +68,14 @@ def remove_kindlegen_markup(parts):
tag = srcpieces[j] tag = srcpieces[j]
if tag.startswith('<'): if tag.startswith('<'):
for m in within_tag_aid_position_pattern.finditer(tag): for m in within_tag_aid_position_pattern.finditer(tag):
try:
aid = m.group(1)
except IndexError:
aid = None
replacement = '' replacement = ''
tag = within_tag_aid_position_pattern.sub(replacement, tag, if aid in linked_aids:
1) replacement = ' id="%s"' % (aid + '-' + aid_anchor_suffix)
tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag srcpieces[j] = tag
part = "".join(srcpieces) part = "".join(srcpieces)
parts[i] = part parts[i] = part
@ -331,7 +336,7 @@ def expand_mobi8_markup(mobi8_reader, resource_map, log):
parts = update_internal_links(mobi8_reader, log) parts = update_internal_links(mobi8_reader, log)
# Remove pointless markup inserted by kindlegen # Remove pointless markup inserted by kindlegen
remove_kindlegen_markup(parts) remove_kindlegen_markup(parts, mobi8_reader.aid_anchor_suffix, mobi8_reader.linked_aids)
# Handle substitutions for the flows pieces first as they may # Handle substitutions for the flows pieces first as they may
# be inlined into the xhtml text # be inlined into the xhtml text

View File

@ -11,6 +11,7 @@ import struct, re, os
from collections import namedtuple from collections import namedtuple
from itertools import repeat, izip from itertools import repeat, izip
from urlparse import urldefrag from urlparse import urldefrag
from uuid import uuid4
from lxml import etree from lxml import etree
@ -72,9 +73,13 @@ class Mobi8Reader(object):
self.mobi6_reader, self.log = mobi6_reader, log self.mobi6_reader, self.log = mobi6_reader, log
self.header = mobi6_reader.book_header self.header = mobi6_reader.book_header
self.encrypted_fonts = [] self.encrypted_fonts = []
self.id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
self.name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
self.aid_re = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
def __call__(self): def __call__(self):
self.mobi6_reader.check_for_drm() self.mobi6_reader.check_for_drm()
self.aid_anchor_suffix = bytes(uuid4().hex)
bh = self.mobi6_reader.book_header bh = self.mobi6_reader.book_header
if self.mobi6_reader.kf8_type == 'joint': if self.mobi6_reader.kf8_type == 'joint':
offset = self.mobi6_reader.kf8_boundary + 2 offset = self.mobi6_reader.kf8_boundary + 2
@ -94,6 +99,7 @@ class Mobi8Reader(object):
self.kf8_sections = self.mobi6_reader.sections[offset-1:] self.kf8_sections = self.mobi6_reader.sections[offset-1:]
self.cover_offset = getattr(self.header.exth, 'cover_offset', None) self.cover_offset = getattr(self.header.exth, 'cover_offset', None)
self.linked_aids = set()
self.read_indices() self.read_indices()
self.build_parts() self.build_parts()
@ -317,12 +323,14 @@ class Mobi8Reader(object):
if plt == npos or pgt < plt: if plt == npos or pgt < plt:
npos = pgt + 1 npos = pgt + 1
textblock = textblock[0:npos] textblock = textblock[0:npos]
id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
for tag in reverse_tag_iter(textblock): for tag in reverse_tag_iter(textblock):
m = id_re.match(tag) or name_re.match(tag) m = self.id_re.match(tag) or self.name_re.match(tag)
if m is not None: if m is not None:
return m.group(1) return m.group(1)
m = self.aid_re.match(tag)
if m is not None:
self.linked_aids.add(m.group(1))
return m.group(1) + b'-' + self.aid_anchor_suffix
# No tag found, link to start of file # No tag found, link to start of file
return b'' return b''