312 lines
12 KiB
Python

#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, os
def update_internal_links(mobi8_reader):
    '''
    Resolve the position based internal links in every part of the book.

    Internal links are stored as ``kindle:pos:fid:XXXX:off:YYYYYYYYYY``,
    where XXXX is the offset in records into divtbl and YYYYYYYYYY is a
    base32 number that is added to the divtbl insertpos to get the final
    position. They have to be resolved **before** any pieces are cut and
    pasted into the xhtml text files, as the links are based on positions
    within those files.

    :param mobi8_reader: Object providing ``parts`` (the encoded xhtml
        parts as bytestrings), ``header.codec`` and
        ``get_id_tag_by_pos_fid(posfid, offset)`` which returns a
        ``(filename, idtag)`` pair.
    :return: A new list with one decoded (unicode) string per part in which
        every internal link has been replaced by ``"filename#idtag"``.
    '''
    mr = mobi8_reader
    codec = mr.header.codec
    # pos:fid pattern
    posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE)
    # Only base32 digits are allowed in the numbers, a stray | would make
    # int(x, 32) fail
    posfid_index_pattern = re.compile(
        br'''['"]kindle:pos:fid:([0-9A-V]+):off:([0-9A-V]+).*?["']''')

    def resolve_link(match):
        # Build the quoted replacement for a single kindle:pos:fid link
        filename, idtag = mr.get_id_tag_by_pos_fid(
            int(match.group(1), 32), int(match.group(2), 32))
        target = filename.split('/')[-1].encode(codec)
        if idtag:
            target += b'#' + idtag
        # The pattern swallows the quotes around the link, so they have to
        # be put back to keep the attribute value delimited
        return b'"' + target.replace(b'"', b'&quot;') + b'"'

    def fix_anchor(match):
        # A function is used as the replacement so that backslashes in the
        # target are never interpreted as regex escapes
        return posfid_index_pattern.sub(resolve_link, match.group(1))

    # All parts are now unicode and have no internal links
    return [posfid_pattern.sub(fix_anchor, part).decode(codec)
            for part in mr.parts]
def remove_kindlegen_markup(parts):
    '''
    Strip the attributes that Kindlegen generates from all tags, in place.

    Two attributes are removed: ``aid`` and ``data-AmznPageBreak``. All
    ``aid`` attributes are removed from every part first, then all
    ``data-AmznPageBreak`` attributes.

    :param parts: List of unicode strings (the xhtml parts). The list is
        modified in place, nothing is returned.
    '''
    # Each entry is (pattern finding the tags that carry the attribute,
    # pattern matching the attribute itself inside such a tag)
    attribute_patterns = (
        # we can safely remove all of the Kindlegen generated aid tags
        (re.compile(r'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE),
         re.compile(r'''\said\s*=['"][^'"]*['"]''')),
        # we can safely remove all of the Kindlegen generated
        # data-AmznPageBreak tags
        (re.compile(r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''',
                    re.IGNORECASE),
         re.compile(r'''\sdata-AmznPageBreak=['"][^'"]*['"]''')),
    )

    def strip_attribute(tag_pattern, attribute_pattern):
        # Remove every occurrence of the attribute from the matching tags
        def clean(match):
            return attribute_pattern.sub('', match.group(1))

        for i, part in enumerate(parts):
            parts[i] = tag_pattern.sub(clean, part)

    for tag_pattern, attribute_pattern in attribute_patterns:
        strip_attribute(tag_pattern, attribute_pattern)
def update_flow_links(mobi8_reader, resource_map, log):
    '''
    Resolve the kindle: links inside all flows.

    The following kinds of links are handled:

        * kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
        * kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images,
          etc)
        * kindle:embed:XXXX (used for fonts)

    XXXX is a base32 number. Embed numbers are one based indices into
    ``resource_map``, flow numbers index ``flowinfo``/``flows`` directly.

    :param mobi8_reader: Object providing ``flows``, ``flowinfo`` and
        ``header.codec``.
    :param resource_map: Indexable collection mapping (number - 1) to the
        href of the resource. A false value means the resource was not
        recognized.
    :param log: Object with a ``warn()`` method, used to report references
        that could not be resolved. Such references are left untouched.
    :return: A new list of flows, in the same order as ``flows``. Every
        flow other than ``None`` is unicode and has its links resolved.
    '''
    mr = mobi8_reader
    img_pattern = re.compile(r'''(<(?:img|image)\s[^>]*>)''', re.IGNORECASE)
    img_index_pattern = re.compile(
        r'''['"]kindle:embed:([0-9A-V]+)[^'"]*['"]''', re.IGNORECASE)
    tag_pattern = re.compile(r'''(<[^>]*>)''')
    flow_pattern = re.compile(
        r'''['"]kindle:flow:([0-9A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
    url_pattern = re.compile(r'''(url\(.*?\))''', re.IGNORECASE)
    url_img_index_pattern = re.compile(
        r'''kindle:embed:([0-9A-V]+)\?mime=image/[^\)]*''', re.IGNORECASE)
    font_index_pattern = re.compile(
        r'''kindle:embed:([0-9A-V]+)''', re.IGNORECASE)
    url_css_index_pattern = re.compile(
        r'''kindle:flow:([0-9A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)

    def as_text(raw):
        # Flows may still be encoded bytestrings
        if isinstance(raw, bytes):
            return raw.decode(mr.header.codec)
        return raw

    def resource_href(num):
        # Numbers are one based. Zero would silently wrap around to the
        # last resource and too large numbers would abort the conversion,
        # treat both as unrecognized resources instead.
        if num < 1:
            return None
        try:
            return resource_map[num - 1]
        except (IndexError, KeyError):
            return None

    # The functions below are used as replacement callbacks, so that every
    # link is replaced by its own target (an unresolved link is returned
    # unchanged) and backslashes in hrefs are never treated as regex escapes

    def image_link(match):
        num = int(match.group(1), 32)
        href = resource_href(num)
        if href:
            return '"%s"' % ('../' + href)
        log.warn('Referenced image %s was not recognized as a valid image '
                 'in %s' % (num, match.string))
        return match.group(0)

    def font_link(match):
        num = int(match.group(1), 32)
        href = resource_href(num)
        if href is None:
            log.warn('Referenced font %s was not recognized as a valid font '
                     'in %s' % (num, match.string))
            return match.group(0)
        if href.endswith('.failed'):
            return '"%s"' % ('failed-' + href)
        return '"%s"' % ('../' + href)

    def flow_file_link(match):
        fi = mr.flowinfo[int(match.group(1), 32)]
        return '"../' + fi.dir + '/' + fi.fname + '"'

    def fix_image_tag(match):
        # links to raster image files from image tags
        return img_index_pattern.sub(image_link, match.group(1))

    def fix_url(match):
        # replacements inside css url(): raster images first, then fonts,
        # then links to other css pieces
        url = url_img_index_pattern.sub(image_link, match.group(1))
        url = font_index_pattern.sub(font_link, url)
        return url_css_index_pattern.sub(flow_file_link, url)

    def fix_flow_tag(match):
        # flow pattern not inside url()
        tag = match.group(1)
        for m in flow_pattern.finditer(tag):
            num = int(m.group(1), 32)
            if mr.flowinfo[num].format == 'inline':
                # The whole tag is replaced by the text of the inline flow
                return as_text(mr.flows[num])
        return flow_pattern.sub(flow_file_link, tag)

    flows = []
    for flow in mr.flows:
        if flow is not None:  # 0th flow is None
            flow = as_text(flow)
            flow = img_pattern.sub(fix_image_tag, flow)
            flow = url_pattern.sub(fix_url, flow)
            flow = tag_pattern.sub(fix_flow_tag, flow)
        flows.append(flow)

    # All flows are now unicode and have links resolved
    return flows
def insert_flows_into_markup(parts, flows, mobi8_reader):
    '''
    Resolve the kindle:flow links in the xhtml parts, in place.

    Links have the form kindle:flow:XXXX?mime=YYYY/ZZZ (used for style
    sheets, svg images, etc), where XXXX is a base32 index into
    ``flowinfo`` and ``flows``. A tag linking to a flow whose format is
    ``'inline'`` is replaced as a whole by the text of that flow, all other
    links are replaced by the quoted relative path of the flow file.

    :param parts: List of unicode strings (the xhtml parts). Modified in
        place, nothing is returned.
    :param flows: The flows with their own links already resolved.
    :param mobi8_reader: Object providing ``flowinfo``.
    '''
    mr = mobi8_reader
    tag_pattern = re.compile(r'''(<[^>]*>)''')
    # Only base32 digits are allowed in the number, a stray | would make
    # int(x, 32) fail
    flow_pattern = re.compile(
        r'''['"]kindle:flow:([0-9A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)

    def flow_file_link(match):
        # Used as a replacement callback so that every link gets its own
        # target and backslashes are never treated as regex escapes
        fi = mr.flowinfo[int(match.group(1), 32)]
        return '"../' + fi.dir + '/' + fi.fname + '"'

    def fix_tag(match):
        tag = match.group(1)
        for m in flow_pattern.finditer(tag):
            num = int(m.group(1), 32)
            if mr.flowinfo[num].format == 'inline':
                # The whole tag is replaced by the inline flow
                return flows[num]
        return flow_pattern.sub(flow_file_link, tag)

    for i, part in enumerate(parts):
        # store away modified version
        parts[i] = tag_pattern.sub(fix_tag, part)
def insert_images_into_markup(parts, resource_map, log):
    '''
    Resolve the links to embedded raster images in the xhtml parts, in place.

    Links have the form kindle:embed:XXXX?mime=image/gif (png, jpeg, etc),
    where XXXX is a base32, one based index into ``resource_map``. Only
    links inside ``<img>`` and ``<image>`` tags are processed.

    :param parts: List of unicode strings (the xhtml parts). Modified in
        place, nothing is returned.
    :param resource_map: Indexable collection mapping (number - 1) to the
        href of the resource. A false value means the resource was not
        recognized.
    :param log: Object with a ``warn()`` method, used to report references
        that could not be resolved. Such references are left untouched.
    '''
    img_pattern = re.compile(r'''(<(?:img|image)\s[^>]*>)''', re.IGNORECASE)
    # Only base32 digits are allowed in the number, a stray | would make
    # int(x, 32) fail
    img_index_pattern = re.compile(
        r'''['"]kindle:embed:([0-9A-V]+)[^'"]*['"]''')

    def resource_href(num):
        # Numbers are one based. Zero would silently wrap around to the
        # last resource and too large numbers would abort the conversion,
        # treat both as unrecognized images instead.
        if num < 1:
            return None
        try:
            return resource_map[num - 1]
        except (IndexError, KeyError):
            return None

    def image_link(match):
        # Used as a replacement callback so that every link gets its own
        # target (an unresolved link is returned unchanged) and backslashes
        # in hrefs are never treated as regex escapes
        num = int(match.group(1), 32)
        href = resource_href(num)
        if href:
            return '"%s"' % ('../' + href)
        log.warn('Referenced image %s was not recognized as a valid image '
                 'in %s' % (num, match.string))
        return match.group(0)

    def fix_image_tag(match):
        # links to raster image files
        return img_index_pattern.sub(image_link, match.group(1))

    for i, part in enumerate(parts):
        # store away modified version
        parts[i] = img_pattern.sub(fix_image_tag, part)
def upshift_markup(parts):
    '''
    Restore the camel case of svg attribute names, in place.

    Inside every opening ``<svg ...>`` tag the lower case spellings
    ``preserveaspectratio`` and ``viewbox`` are replaced by
    ``preserveAspectRatio`` and ``viewBox``. Text outside of these tags is
    not changed.

    :param parts: List of unicode strings (the xhtml parts). Modified in
        place, nothing is returned.
    '''
    tag_pattern = re.compile(r'''(<(?:svg)[^>]*>)''', re.IGNORECASE)

    def fix_svg_tag(match):
        # The pattern only matches tags that start with <svg (in any case),
        # so no further check of the tag name is needed
        tag = match.group(1)
        tag = tag.replace('preserveaspectratio', 'preserveAspectRatio')
        return tag.replace('viewbox', 'viewBox')

    for i, part in enumerate(parts):
        # store away modified version
        parts[i] = tag_pattern.sub(fix_svg_tag, part)
def expand_mobi8_markup(mobi8_reader, resource_map, log):
    '''
    Resolve all kindle specific markup and write the results to disk.

    The parts and flows of the reader are processed, stored back on the
    reader as ``parts`` and ``flows`` and then written out, utf-8 encoded,
    to paths relative to the current working directory.

    :param mobi8_reader: The reader whose ``parts`` and ``flows`` are
        expanded. Its ``partinfo`` and ``flowinfo`` entries provide the
        directory and file name each part/flow is written to.
    :param resource_map: Passed on to the functions that resolve links to
        embedded resources (images and fonts).
    :param log: Logger, passed on to the functions that resolve links.
    :return: The spine, a list with the name of the file written for every
        part, in order.
    '''
    # First update all internal links that are based on offsets
    parts = update_internal_links(mobi8_reader)

    # Remove pointless markup inserted by kindlegen
    remove_kindlegen_markup(parts)

    # Handle substitutions for the flows pieces first as they may
    # be inlined into the xhtml text
    flows = update_flow_links(mobi8_reader, resource_map, log)

    # Insert inline flows into the markup
    insert_flows_into_markup(parts, flows, mobi8_reader)

    # Insert raster images into markup
    insert_images_into_markup(parts, resource_map, log)

    # Perform general markup cleanups
    upshift_markup(parts)

    # Update the parts and flows stored in the reader
    mobi8_reader.parts = parts
    mobi8_reader.flows = flows

    # write out the parts and file flows
    # Do not fail if the directory is already there, this matches the
    # handling of the flow directories below
    if not os.path.isdir('text'):
        os.mkdir('text')  # directory containing all parts
    spine = []
    for i, part in enumerate(parts):
        pi = mobi8_reader.partinfo[i]
        with open(os.path.join(pi.type, pi.filename), 'wb') as f:
            f.write(part.encode('utf-8'))
            spine.append(f.name)

    for i, flow in enumerate(flows):
        fi = mobi8_reader.flowinfo[i]
        # Only flows stored as separate files are written, other flows are
        # not written here
        if fi.format == 'file':
            if not os.path.exists(fi.dir):
                os.mkdir(fi.dir)
            with open(os.path.join(fi.dir, fi.fname), 'wb') as f:
                f.write(flow.encode('utf-8'))

    return spine