mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
AZW3 Input: Handle kindlegen-produced azw3 files that do not use normal HTML anchors for linking.
This commit is contained in:
parent
c617a95672
commit
e198d427b5
@ -54,12 +54,12 @@ def update_internal_links(mobi8_reader, log):
|
|||||||
# All parts are now unicode and have no internal links
|
# All parts are now unicode and have no internal links
|
||||||
return parts
|
return parts
|
||||||
|
|
||||||
def remove_kindlegen_markup(parts):
|
def remove_kindlegen_markup(parts, aid_anchor_suffix, linked_aids):
|
||||||
|
|
||||||
# we can safely remove all of the Kindlegen generated aid tags
|
# we can safely remove all of the Kindlegen generated aid tags
|
||||||
find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''',
|
find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''',
|
||||||
re.IGNORECASE)
|
re.IGNORECASE)
|
||||||
within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"][^'"]*['"]''')
|
within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"]([^'"]*)['"]''')
|
||||||
|
|
||||||
for i in xrange(len(parts)):
|
for i in xrange(len(parts)):
|
||||||
part = parts[i]
|
part = parts[i]
|
||||||
@ -68,9 +68,14 @@ def remove_kindlegen_markup(parts):
|
|||||||
tag = srcpieces[j]
|
tag = srcpieces[j]
|
||||||
if tag.startswith('<'):
|
if tag.startswith('<'):
|
||||||
for m in within_tag_aid_position_pattern.finditer(tag):
|
for m in within_tag_aid_position_pattern.finditer(tag):
|
||||||
|
try:
|
||||||
|
aid = m.group(1)
|
||||||
|
except IndexError:
|
||||||
|
aid = None
|
||||||
replacement = ''
|
replacement = ''
|
||||||
tag = within_tag_aid_position_pattern.sub(replacement, tag,
|
if aid in linked_aids:
|
||||||
1)
|
replacement = ' id="%s"' % (aid + '-' + aid_anchor_suffix)
|
||||||
|
tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
|
||||||
srcpieces[j] = tag
|
srcpieces[j] = tag
|
||||||
part = "".join(srcpieces)
|
part = "".join(srcpieces)
|
||||||
parts[i] = part
|
parts[i] = part
|
||||||
@ -331,7 +336,7 @@ def expand_mobi8_markup(mobi8_reader, resource_map, log):
|
|||||||
parts = update_internal_links(mobi8_reader, log)
|
parts = update_internal_links(mobi8_reader, log)
|
||||||
|
|
||||||
# Remove pointless markup inserted by kindlegen
|
# Remove pointless markup inserted by kindlegen
|
||||||
remove_kindlegen_markup(parts)
|
remove_kindlegen_markup(parts, mobi8_reader.aid_anchor_suffix, mobi8_reader.linked_aids)
|
||||||
|
|
||||||
# Handle substitutions for the flows pieces first as they may
|
# Handle substitutions for the flows pieces first as they may
|
||||||
# be inlined into the xhtml text
|
# be inlined into the xhtml text
|
||||||
|
@ -11,6 +11,7 @@ import struct, re, os
|
|||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
from itertools import repeat, izip
|
from itertools import repeat, izip
|
||||||
from urlparse import urldefrag
|
from urlparse import urldefrag
|
||||||
|
from uuid import uuid4
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
@ -72,9 +73,13 @@ class Mobi8Reader(object):
|
|||||||
self.mobi6_reader, self.log = mobi6_reader, log
|
self.mobi6_reader, self.log = mobi6_reader, log
|
||||||
self.header = mobi6_reader.book_header
|
self.header = mobi6_reader.book_header
|
||||||
self.encrypted_fonts = []
|
self.encrypted_fonts = []
|
||||||
|
self.id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
|
||||||
|
self.name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
|
||||||
|
self.aid_re = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
|
||||||
|
|
||||||
def __call__(self):
|
def __call__(self):
|
||||||
self.mobi6_reader.check_for_drm()
|
self.mobi6_reader.check_for_drm()
|
||||||
|
self.aid_anchor_suffix = bytes(uuid4().hex)
|
||||||
bh = self.mobi6_reader.book_header
|
bh = self.mobi6_reader.book_header
|
||||||
if self.mobi6_reader.kf8_type == 'joint':
|
if self.mobi6_reader.kf8_type == 'joint':
|
||||||
offset = self.mobi6_reader.kf8_boundary + 2
|
offset = self.mobi6_reader.kf8_boundary + 2
|
||||||
@ -94,6 +99,7 @@ class Mobi8Reader(object):
|
|||||||
self.kf8_sections = self.mobi6_reader.sections[offset-1:]
|
self.kf8_sections = self.mobi6_reader.sections[offset-1:]
|
||||||
|
|
||||||
self.cover_offset = getattr(self.header.exth, 'cover_offset', None)
|
self.cover_offset = getattr(self.header.exth, 'cover_offset', None)
|
||||||
|
self.linked_aids = set()
|
||||||
|
|
||||||
self.read_indices()
|
self.read_indices()
|
||||||
self.build_parts()
|
self.build_parts()
|
||||||
@ -317,12 +323,14 @@ class Mobi8Reader(object):
|
|||||||
if plt == npos or pgt < plt:
|
if plt == npos or pgt < plt:
|
||||||
npos = pgt + 1
|
npos = pgt + 1
|
||||||
textblock = textblock[0:npos]
|
textblock = textblock[0:npos]
|
||||||
id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
|
|
||||||
name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
|
|
||||||
for tag in reverse_tag_iter(textblock):
|
for tag in reverse_tag_iter(textblock):
|
||||||
m = id_re.match(tag) or name_re.match(tag)
|
m = self.id_re.match(tag) or self.name_re.match(tag)
|
||||||
if m is not None:
|
if m is not None:
|
||||||
return m.group(1)
|
return m.group(1)
|
||||||
|
m = self.aid_re.match(tag)
|
||||||
|
if m is not None:
|
||||||
|
self.linked_aids.add(m.group(1))
|
||||||
|
return m.group(1) + b'-' + self.aid_anchor_suffix
|
||||||
|
|
||||||
# No tag found, link to start of file
|
# No tag found, link to start of file
|
||||||
return b''
|
return b''
|
||||||
|
Loading…
x
Reference in New Issue
Block a user