mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
AZW3 Input: Handle kindlegen-produced AZW3 files that do not use normal HTML anchors for linking.
This commit is contained in:
parent
c617a95672
commit
e198d427b5
@ -54,12 +54,12 @@ def update_internal_links(mobi8_reader, log):
|
||||
# All parts are now unicode and have no internal links
|
||||
return parts
|
||||
|
||||
def remove_kindlegen_markup(parts):
|
||||
def remove_kindlegen_markup(parts, aid_anchor_suffix, linked_aids):
|
||||
|
||||
# we can safely remove all of the Kindlegen generated aid tags
|
||||
find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''',
|
||||
re.IGNORECASE)
|
||||
within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"][^'"]*['"]''')
|
||||
within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"]([^'"]*)['"]''')
|
||||
|
||||
for i in xrange(len(parts)):
|
||||
part = parts[i]
|
||||
@ -68,9 +68,14 @@ def remove_kindlegen_markup(parts):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith('<'):
|
||||
for m in within_tag_aid_position_pattern.finditer(tag):
|
||||
try:
|
||||
aid = m.group(1)
|
||||
except IndexError:
|
||||
aid = None
|
||||
replacement = ''
|
||||
tag = within_tag_aid_position_pattern.sub(replacement, tag,
|
||||
1)
|
||||
if aid in linked_aids:
|
||||
replacement = ' id="%s"' % (aid + '-' + aid_anchor_suffix)
|
||||
tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
|
||||
srcpieces[j] = tag
|
||||
part = "".join(srcpieces)
|
||||
parts[i] = part
|
||||
@ -331,7 +336,7 @@ def expand_mobi8_markup(mobi8_reader, resource_map, log):
|
||||
parts = update_internal_links(mobi8_reader, log)
|
||||
|
||||
# Remove pointless markup inserted by kindlegen
|
||||
remove_kindlegen_markup(parts)
|
||||
remove_kindlegen_markup(parts, mobi8_reader.aid_anchor_suffix, mobi8_reader.linked_aids)
|
||||
|
||||
# Handle substitutions for the flows pieces first as they may
|
||||
# be inlined into the xhtml text
|
||||
|
@ -11,6 +11,7 @@ import struct, re, os
|
||||
from collections import namedtuple
|
||||
from itertools import repeat, izip
|
||||
from urlparse import urldefrag
|
||||
from uuid import uuid4
|
||||
|
||||
from lxml import etree
|
||||
|
||||
@ -72,9 +73,13 @@ class Mobi8Reader(object):
|
||||
self.mobi6_reader, self.log = mobi6_reader, log
|
||||
self.header = mobi6_reader.book_header
|
||||
self.encrypted_fonts = []
|
||||
self.id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
|
||||
self.name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
|
||||
self.aid_re = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
|
||||
|
||||
def __call__(self):
|
||||
self.mobi6_reader.check_for_drm()
|
||||
self.aid_anchor_suffix = bytes(uuid4().hex)
|
||||
bh = self.mobi6_reader.book_header
|
||||
if self.mobi6_reader.kf8_type == 'joint':
|
||||
offset = self.mobi6_reader.kf8_boundary + 2
|
||||
@ -94,6 +99,7 @@ class Mobi8Reader(object):
|
||||
self.kf8_sections = self.mobi6_reader.sections[offset-1:]
|
||||
|
||||
self.cover_offset = getattr(self.header.exth, 'cover_offset', None)
|
||||
self.linked_aids = set()
|
||||
|
||||
self.read_indices()
|
||||
self.build_parts()
|
||||
@ -317,12 +323,14 @@ class Mobi8Reader(object):
|
||||
if plt == npos or pgt < plt:
|
||||
npos = pgt + 1
|
||||
textblock = textblock[0:npos]
|
||||
id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
|
||||
name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
|
||||
for tag in reverse_tag_iter(textblock):
|
||||
m = id_re.match(tag) or name_re.match(tag)
|
||||
m = self.id_re.match(tag) or self.name_re.match(tag)
|
||||
if m is not None:
|
||||
return m.group(1)
|
||||
m = self.aid_re.match(tag)
|
||||
if m is not None:
|
||||
self.linked_aids.add(m.group(1))
|
||||
return m.group(1) + b'-' + self.aid_anchor_suffix
|
||||
|
||||
# No tag found, link to start of file
|
||||
return b''
|
||||
|
Loading…
x
Reference in New Issue
Block a user