'
__docformat__ = 'restructuredtext en'
-from cStringIO import StringIO
-
from calibre.customize.conversion import OutputFormatPlugin
from calibre.customize.conversion import OptionRecommendation
@@ -79,18 +77,9 @@ class MOBIOutput(OutputFormatPlugin):
def check_for_masthead(self):
found = 'masthead' in self.oeb.guide
if not found:
+ from calibre.ebooks import generate_masthead
self.oeb.log.debug('No masthead found in manifest, generating default mastheadImage...')
- try:
- from PIL import Image as PILImage
- PILImage
- except ImportError:
- import Image as PILImage
-
- raw = open(P('content_server/calibre_banner.png'), 'rb')
- im = PILImage.open(raw)
- of = StringIO()
- im.save(of, 'GIF')
- raw = of.getvalue()
+ raw = generate_masthead(unicode(self.oeb.metadata['title'][0]))
id, href = self.oeb.manifest.generate('masthead', 'masthead')
self.oeb.manifest.add(id, href, 'image/gif', data=raw)
self.oeb.guide.add('masthead', 'Masthead Image', href)
@@ -151,17 +140,63 @@ class MOBIOutput(OutputFormatPlugin):
# Fix up the periodical href to point to first section href
toc.nodes[0].href = toc.nodes[0].nodes[0].href
+ def remove_html_cover(self):
+ from calibre.ebooks.oeb.base import OEB_DOCS
+
+ oeb = self.oeb
+ if not oeb.metadata.cover \
+ or 'cover' not in oeb.guide:
+ return
+ href = oeb.guide['cover'].href
+ del oeb.guide['cover']
+ item = oeb.manifest.hrefs[href]
+ if item.spine_position is not None:
+ self.log.warn('Found an HTML cover: ', item.href, 'removing it.',
+ 'If you find some content missing from the output MOBI, it '
+ 'is because you misidentified the HTML cover in the input '
+ 'document')
+ oeb.spine.remove(item)
+ if item.media_type in OEB_DOCS:
+ self.oeb.manifest.remove(item)
+
def convert(self, oeb, output_path, input_plugin, opts, log):
+ from calibre.utils.config import tweaks
+ from calibre.ebooks.mobi.writer2.resources import Resources
self.log, self.opts, self.oeb = log, opts, oeb
- kf8 = self.create_kf8()
- self.write_mobi(input_plugin, output_path, kf8)
+ mobi_type = tweaks.get('test_mobi_output_type', 'old')
+ if self.is_periodical:
+ mobi_type = 'old' # Amazon does not support KF8 periodicals
+ create_kf8 = mobi_type in ('new', 'both')
- def create_kf8(self):
- from calibre.ebooks.mobi.writer8.main import KF8Writer
- return KF8Writer(self.oeb, self.opts)
+ self.remove_html_cover()
+ resources = Resources(oeb, opts, self.is_periodical,
+ add_fonts=create_kf8)
+ self.check_for_periodical()
- def write_mobi(self, input_plugin, output_path, kf8):
+ if create_kf8:
+ # Split on pagebreaks so that the resulting KF8 works better with
+ # calibre's viewer, which does not support CSS page breaks
+ from calibre.ebooks.oeb.transforms.split import Split
+ Split()(self.oeb, self.opts)
+
+
+ kf8 = self.create_kf8(resources, for_joint=mobi_type=='both'
+ ) if create_kf8 else None
+ if mobi_type == 'new':
+ kf8.write(output_path)
+ self.extract_mobi(output_path, opts)
+ return
+
+ self.log('Creating MOBI 6 output')
+ self.write_mobi(input_plugin, output_path, kf8, resources)
+
+ def create_kf8(self, resources, for_joint=False):
+ from calibre.ebooks.mobi.writer8.main import create_kf8_book
+ return create_kf8_book(self.oeb, self.opts, resources,
+ for_joint=for_joint)
+
+ def write_mobi(self, input_plugin, output_path, kf8, resources):
from calibre.ebooks.mobi.mobiml import MobiMLizer
from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
@@ -180,15 +215,19 @@ class MOBIOutput(OutputFormatPlugin):
rasterizer(oeb, opts)
except Unavailable:
self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
+ else:
+ # Add rasterized SVG images
+ resources.add_extra_images()
mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
mobimlizer(oeb, opts)
- self.check_for_periodical()
write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
from calibre.ebooks.mobi.writer2.main import MobiWriter
- writer = MobiWriter(opts,
+ writer = MobiWriter(opts, resources, kf8,
write_page_breaks_after_item=write_page_breaks_after_item)
writer(oeb, output_path)
+ self.extract_mobi(output_path, opts)
+ def extract_mobi(self, output_path, opts):
if opts.extract_to is not None:
from calibre.ebooks.mobi.debug.main import inspect_mobi
ddir = opts.extract_to
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index dbba38e987..14bc0a0b78 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -536,7 +536,7 @@ OptionRecommendation(name='pubdate',
OptionRecommendation(name='timestamp',
recommended_value=None, level=OptionRecommendation.LOW,
- help=_('Set the book timestamp (used by the date column in calibre).')),
+ help=_('Set the book timestamp (no longer used anywhere)')),
OptionRecommendation(name='enable_heuristics',
recommended_value=False, level=OptionRecommendation.LOW,
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index c526cba8a9..16acaad383 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -559,7 +559,7 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Un wrap using punctuation
- (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?(i|b|u)>)?\s*(
\s*\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+ (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?(i|b|u)>)?\s*(
\s*\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index f154764515..2c1a5cd4d3 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -148,6 +148,7 @@ class HeuristicProcessor(object):
return wordcount.words
def markup_italicis(self, html):
+ self.log.debug("\n\n\nitalicize debugging \n\n\n")
ITALICIZE_WORDS = [
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
@@ -156,28 +157,30 @@ class HeuristicProcessor(object):
]
ITALICIZE_STYLE_PATS = [
- ur'(?msu)(?<=[\s>"“\'‘])_(?P<words>[^_]+)_',
- ur'(?msu)(?<=[\s>"“\'‘])/(?P<words>[^/\*><]+)/',
+ ur'(?msu)(?<=[\s>"“\'‘])_\*/(?P<words>[^\*_]+)/\*_',
ur'(?msu)(?<=[\s>"“\'‘])~~(?P<words>[^~]+)~~',
- ur'(?msu)(?<=[\s>"“\'‘])\*(?P<words>[^\*]+)\*',
- ur'(?msu)(?<=[\s>"“\'‘])~(?P<words>[^~]+)~',
ur'(?msu)(?<=[\s>"“\'‘])_/(?P<words>[^/_]+)/_',
ur'(?msu)(?<=[\s>"“\'‘])_\*(?P<words>[^\*_]+)\*_',
ur'(?msu)(?<=[\s>"“\'‘])\*/(?P<words>[^/\*]+)/\*',
- ur'(?msu)(?<=[\s>"“\'‘])_\*/(?P<words>[^\*_]+)/\*_',
ur'(?msu)(?<=[\s>"“\'‘])/:(?P<words>[^:/]+):/',
ur'(?msu)(?<=[\s>"“\'‘])\|:(?P<words>[^:\|]+):\|',
+ ur'(?msu)(?<=[\s>"“\'‘])\*(?P<words>[^\*]+)\*',
+ ur'(?msu)(?<=[\s>"“\'‘])~(?P<words>[^~]+)~',
+ ur'(?msu)(?<=[\s>"“\'‘])/(?P<words>[^/\*><]+)/',
+ ur'(?msu)(?<=[\s>"“\'‘])_(?P<words>[^_]+)_'
]
for word in ITALICIZE_WORDS:
html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html)
- def sub(mo):
- return '<i>%s</i>'%mo.group('words')
-
+ search_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
+ search_text = re.sub(r'<[^>]*>', '', search_text)
for pat in ITALICIZE_STYLE_PATS:
- html = re.sub(pat, sub, html)
-
+ for match in re.finditer(pat, search_text):
+ ital_string = str(match.group('words'))
+ #self.log.debug("italicising "+str(match.group(0))+" with <i>"+ital_string+"</i>")
+ html = re.sub(re.escape(str(match.group(0))), '<i>%s</i>' % ital_string, html)
+
return html
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
@@ -316,13 +319,20 @@ class HeuristicProcessor(object):
'''
Unwraps lines based on line length and punctuation
supports a range of html markup and text files
+
+ the lookahead regex below is meant look for any non-full stop characters - punctuation
+ characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc
+ the reason for this is to prevent false positive wrapping. False positives are more
+ difficult to detect than false negatives during a manual review of the doc
+
+ This function intentionally leaves hyphenated content alone as that is handled by the
+ dehyphenate routine in a separate step
'''
- # define the pieces of the regex
- lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?\s*((span|[iubp]|div)>)?"
blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*"
line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
@@ -331,23 +341,19 @@ class HeuristicProcessor(object):
unwrap_regex = lookahead+line_ending+blanklines+line_opening
em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
- dash_unwrap_regex = dash+line_ending+blanklines+line_opening
if format == 'txt':
unwrap_regex = lookahead+txt_line_wrap
em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
shy_unwrap_regex = soft_hyphen+txt_line_wrap
- dash_unwrap_regex = dash+txt_line_wrap
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
- dash_unwrap = re.compile(u"%s" % dash_unwrap_regex, re.UNICODE)
content = unwrap.sub(' ', content)
content = em_en_unwrap.sub('', content)
content = shy_unwrap.sub('', content)
- content = dash_unwrap.sub('', content)
return content
def txt_process(self, match):
@@ -460,27 +466,31 @@ class HeuristicProcessor(object):
return html
def detect_whitespace(self, html):
- blanks_around_headings = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<heading><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
+ blanks_around_headings = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
+ blanks_around_scene_breaks = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
blanks_n_nopunct = re.compile(r'(?P(]*>\s*
\s*){1,}\s*)?]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W]((span|[ibu]|em|strong|font)>\s*)*
(?P\s*(]*>\s*
\s*){1,})?', re.IGNORECASE|re.DOTALL)
def merge_header_whitespace(match):
initblanks = match.group('initparas')
- endblanks = match.group('initparas')
- heading = match.group('heading')
+ endblanks = match.group('endparas')
+ content = match.group('content')
top_margin = ''
bottom_margin = ''
if initblanks is not None:
top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
if endblanks is not None:
- bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(initblanks)))+'em;'
+ bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;'
if initblanks == None and endblanks == None:
- return heading
+ return content
+ elif content.find('scenebreak') != -1:
+ return content
else:
- heading = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n'+'<h\g<hnum> style="'+top_margin+bottom_margin+'">', heading)
- return heading
+ content = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n'+'<h\g<hnum> style="'+top_margin+bottom_margin+'">', content)
+ return content
html = blanks_around_headings.sub(merge_header_whitespace, html)
+ html = blanks_around_scene_breaks.sub(merge_header_whitespace, html)
def markup_whitespaces(match):
blanks = match.group(0)
@@ -515,6 +525,12 @@ class HeuristicProcessor(object):
html = self.blankreg.sub('\n
', html)
return html
+ def detect_scene_breaks(self, html):
+ scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
+ scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
+ html = scene_breaks.sub(self.scene_break_open+'\g<break>'+'</p>', html)
+ return html
+
def markup_user_break(self, replacement_break):
'''
Takes string a user supplies and wraps it in markup that will be centered with
@@ -781,25 +797,25 @@ class HeuristicProcessor(object):
if getattr(self.extra_opts, 'format_scene_breaks', False):
self.log.debug('Formatting scene breaks')
html = re.sub('(?i)]*>\s*
\s*
', '', html)
+ html = self.detect_scene_breaks(html)
html = self.detect_whitespace(html)
html = self.detect_soft_breaks(html)
blanks_count = len(self.any_multi_blank.findall(html))
if blanks_count >= 1:
html = self.merge_blanks(html, blanks_count)
- scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
- scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
+ detected_scene_break = re.compile(r'<p class="scenebreak"[^>]*>.*?</p>')
+ scene_break_count = len(detected_scene_break.findall(html))
# If the user has enabled scene break replacement, then either softbreaks
# or 'hard' scene breaks are replaced, depending on which is in use
# Otherwise separator lines are centered, use a bit larger margin in this case
replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
if replacement_break:
replacement_break = self.markup_user_break(replacement_break)
- if len(scene_break.findall(html)) >= 1:
- html = scene_break.sub(replacement_break, html)
+ if scene_break_count >= 1:
+ html = detected_scene_break.sub(replacement_break, html)
+ html = re.sub(']*>\s*
', replacement_break, html)
else:
html = re.sub(']*>\s*
', replacement_break, html)
- else:
- html = scene_break.sub(self.scene_break_open+'\g<break>'+'</p>', html)
if self.deleted_nbsps:
# put back non-breaking spaces in empty paragraphs so they render correctly
diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index b45f8f9f9e..b846d76a95 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -18,6 +18,7 @@ from lxml import etree
from calibre import prepare_string_for_xml
from calibre.constants import __appname__, __version__
from calibre.utils.magick import Image
+from calibre.utils.localization import lang_as_iso639_1
class FB2MLizer(object):
'''
@@ -103,7 +104,10 @@ class FB2MLizer(object):
metadata['version'] = __version__
metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)
if self.oeb_book.metadata.language:
- metadata['lang'] = self.oeb_book.metadata.language[0].value
+ lc = lang_as_iso639_1(self.oeb_book.metadata.language[0].value)
+ if not lc:
+ lc = self.oeb_book.metadata.language[0].value
+ metadata['lang'] = lc or 'en'
else:
metadata['lang'] = u'en'
metadata['id'] = None
diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py
index bb1bbb9d42..6d6ebd3990 100755
--- a/src/calibre/ebooks/metadata/sources/overdrive.py
+++ b/src/calibre/ebooks/metadata/sources/overdrive.py
@@ -197,14 +197,18 @@ class OverDrive(Source):
title_tokens = list(self.get_title_tokens(title,
strip_joiners=False, strip_subtitle=True))
- if len(title_tokens) >= len(author_tokens):
+ xref_q = ''
+ if len(author_tokens) <= 1:
initial_q = ' '.join(title_tokens)
xref_q = '+'.join(author_tokens)
else:
initial_q = ' '.join(author_tokens)
- xref_q = '+'.join(title_tokens)
- #log.error('Initial query is %s'%initial_q)
- #log.error('Cross reference query is %s'%xref_q)
+ for token in title_tokens:
+ if len(xref_q) < len(token):
+ xref_q = token
+
+ log.error('Initial query is %s'%initial_q)
+ log.error('Cross reference query is %s'%xref_q)
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
query = '{"szKeyword":"'+initial_q+'"}'
@@ -219,27 +223,30 @@ class OverDrive(Source):
# get the search results object
results = False
+ iterations = 0
while results == False:
+ iterations += 1
xreq = mechanize.Request(q_xref)
xreq.add_header('X-Requested-With', 'XMLHttpRequest')
xreq.add_header('Referer', q_init_search)
xreq.add_header('Accept', 'application/json, text/javascript, */*')
raw = br.open_novisit(xreq).read()
for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw):
- if int(m.group('displayrecords')) >= 1:
- results = True
- elif int(m.group('totalrecords')) >= 1:
- if int(m.group('totalrecords')) >= 100:
- if xref_q.find('+') != -1:
- xref_tokens = xref_q.split('+')
- xref_q = xref_tokens[0]
- #log.error('xref_q is '+xref_q)
- else:
- xref_q = ''
- xref_q = ''
- q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
- elif int(m.group('totalrecords')) == 0:
+ if int(m.group('totalrecords')) == 0:
return ''
+ elif int(m.group('displayrecords')) >= 1:
+ results = True
+ elif int(m.group('totalrecords')) >= 1 and iterations < 3:
+ if xref_q.find('+') != -1:
+ xref_tokens = xref_q.split('+')
+ xref_q = xref_tokens[0]
+ for token in xref_tokens:
+ if len(xref_q) < len(token):
+ xref_q = token
+ #log.error('rewrote xref_q, new query is '+xref_q)
+ else:
+ xref_q = ''
+ q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
return self.sort_ovrdrv_results(raw, log, title, title_tokens, author, author_tokens)
@@ -263,6 +270,7 @@ class OverDrive(Source):
else:
if creators:
creators = creators.split(', ')
+
# if an exact match in a preferred format occurs
if ((author and creators and creators[0] == author[0]) or (not author and not creators)) and od_title.lower() == title.lower() and int(formatid) in [1, 50, 410, 900] and thumbimage:
return self.format_results(reserveid, od_title, subtitle, series, publisher,
@@ -330,9 +338,9 @@ class OverDrive(Source):
def find_ovrdrv_data(self, br, log, title, author, isbn, ovrdrv_id=None):
q = base_url
if ovrdrv_id is None:
- return self.overdrive_search(br, log, q, title, author)
+ return self.overdrive_search(br, log, q, title, author)
else:
- return self.overdrive_get_record(br, log, q, ovrdrv_id)
+ return self.overdrive_get_record(br, log, q, ovrdrv_id)
@@ -461,10 +469,10 @@ if __name__ == '__main__':
[
(
- {'title':'Foundation and Earth',
- 'authors':['Asimov']},
- [title_test('Foundation and Earth', exact=True),
- authors_test(['Isaac Asimov'])]
+ {'title':'The Sea Kings Daughter',
+ 'authors':['Elizabeth Peters']},
+ [title_test('The Sea Kings Daughter', exact=False),
+ authors_test(['Elizabeth Peters'])]
),
(
diff --git a/src/calibre/ebooks/mobi/debug/headers.py b/src/calibre/ebooks/mobi/debug/headers.py
index 07a3fa91b9..77a31606e2 100644
--- a/src/calibre/ebooks/mobi/debug/headers.py
+++ b/src/calibre/ebooks/mobi/debug/headers.py
@@ -295,19 +295,17 @@ class MOBIHeader(object): # {{{
self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128])
self.exth_flags, = struct.unpack(b'>I', self.raw[128:132])
self.has_exth = bool(self.exth_flags & 0x40)
- self.has_drm_data = self.length >= 174 and len(self.raw) >= 180
+ self.has_drm_data = self.length >= 174 and len(self.raw) >= 184
if self.has_drm_data:
- self.unknown3 = self.raw[132:164]
- self.drm_offset, = struct.unpack(b'>I', self.raw[164:168])
- self.drm_count, = struct.unpack(b'>I', self.raw[168:172])
- self.drm_size, = struct.unpack(b'>I', self.raw[172:176])
- self.drm_flags = bin(struct.unpack(b'>I', self.raw[176:180])[0])
+ self.unknown3 = self.raw[132:168]
+ self.drm_offset, self.drm_count, self.drm_size, self.drm_flags = \
+ struct.unpack(b'>4I', self.raw[168:184])
self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16
self.has_fcis_flis = False
self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False
self.extra_data_flags = 0
if self.has_extra_data_flags:
- self.unknown4 = self.raw[180:192]
+ self.unknown4 = self.raw[184:192]
self.fdst_idx, self.fdst_count = struct.unpack_from(b'>LL',
self.raw, 192)
if self.fdst_count <= 1:
@@ -329,7 +327,7 @@ class MOBIHeader(object): # {{{
self.primary_index_record, = struct.unpack(b'>I',
self.raw[244:248])
- if self.file_version >= 8:
+ if self.length >= 248:
(self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx
) = struct.unpack_from(b'>4L', self.raw, 248)
self.unknown9 = self.raw[264:self.length]
@@ -339,11 +337,12 @@ class MOBIHeader(object): # {{{
# The following are all relative to the position of the header record
# make them absolute for ease of debugging
- for x in ('sect_idx', 'skel_idx', 'datp_idx', 'oth_idx',
+ self.relative_records = {'sect_idx', 'skel_idx', 'datp_idx', 'oth_idx',
'meta_orth_indx', 'huffman_record_offset',
'first_non_book_record', 'datp_record_offset', 'fcis_number',
'flis_number', 'primary_index_record', 'fdst_idx',
- 'first_image_index'):
+ 'first_image_index'}
+ for x in self.relative_records:
if hasattr(self, x) and getattr(self, x) != NULL_INDEX:
setattr(self, x, self.header_offset+getattr(self, x))
@@ -357,70 +356,79 @@ class MOBIHeader(object): # {{{
def __str__(self):
ans = ['*'*20 + ' MOBI %d Header '%self.file_version+ '*'*20]
+
a = ans.append
- i = lambda d, x : a('%s (null value: %d): %d'%(d, NULL_INDEX, x))
- ans.append('Compression: %s'%self.compression)
- ans.append('Unused: %r'%self.unused)
- ans.append('Number of text records: %d'%self.number_of_text_records)
- ans.append('Text record size: %d'%self.text_record_size)
- ans.append('Encryption: %s'%self.encryption_type)
- ans.append('Unknown: %r'%self.unknown)
- ans.append('Identifier: %r'%self.identifier)
- ans.append('Header length: %d'% self.length)
- ans.append('Type: %s'%self.type)
- ans.append('Encoding: %s'%self.encoding)
- ans.append('UID: %r'%self.uid)
- ans.append('File version: %d'%self.file_version)
- i('Meta Orth Index (Sections index in KF8)', self.meta_orth_indx)
- i('Meta Infl Index', self.meta_infl_indx)
- ans.append('Secondary index record: %d (null val: %d)'%(
- self.secondary_index_record, NULL_INDEX))
- ans.append('Reserved: %r'%self.reserved)
- ans.append('First non-book record (null value: %d): %d'%(NULL_INDEX,
- self.first_non_book_record))
- ans.append('Full name offset: %d'%self.fullname_offset)
- ans.append('Full name length: %d bytes'%self.fullname_length)
- ans.append('Langcode: %r'%self.locale_raw)
- ans.append('Language: %s'%self.language)
- ans.append('Sub language: %s'%self.sublanguage)
- ans.append('Input language: %r'%self.input_language)
- ans.append('Output language: %r'%self.output_langauage)
- ans.append('Min version: %d'%self.min_version)
- ans.append('First Image index: %d'%self.first_image_index)
- ans.append('Huffman record offset: %d'%self.huffman_record_offset)
- ans.append('Huffman record count: %d'%self.huffman_record_count)
- ans.append('DATP record offset: %r'%self.datp_record_offset)
- ans.append('DATP record count: %r'%self.datp_record_count)
- ans.append('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth))
+
+ def i(d, x):
+ x = 'NULL' if x == NULL_INDEX else x
+ a('%s: %s'%(d, x))
+
+ def r(d, attr):
+ x = getattr(self, attr)
+ if attr in self.relative_records and x != NULL_INDEX:
+ a('%s: Absolute: %d Relative: %d'%(d, x, x-self.header_offset))
+ else:
+ i(d, x)
+
+ a('Compression: %s'%self.compression)
+ a('Unused: %r'%self.unused)
+ a('Number of text records: %d'%self.number_of_text_records)
+ a('Text record size: %d'%self.text_record_size)
+ a('Encryption: %s'%self.encryption_type)
+ a('Unknown: %r'%self.unknown)
+ a('Identifier: %r'%self.identifier)
+ a('Header length: %d'% self.length)
+ a('Type: %s'%self.type)
+ a('Encoding: %s'%self.encoding)
+ a('UID: %r'%self.uid)
+ a('File version: %d'%self.file_version)
+ r('Meta Orth Index', 'meta_orth_indx')
+ r('Meta Infl Index', 'meta_infl_indx')
+ r('Secondary index record', 'secondary_index_record')
+ a('Reserved: %r'%self.reserved)
+ r('First non-book record', 'first_non_book_record')
+ a('Full name offset: %d'%self.fullname_offset)
+ a('Full name length: %d bytes'%self.fullname_length)
+ a('Langcode: %r'%self.locale_raw)
+ a('Language: %s'%self.language)
+ a('Sub language: %s'%self.sublanguage)
+ a('Input language: %r'%self.input_language)
+ a('Output language: %r'%self.output_langauage)
+ a('Min version: %d'%self.min_version)
+ r('First Image index', 'first_image_index')
+ r('Huffman record offset', 'huffman_record_offset')
+ a('Huffman record count: %d'%self.huffman_record_count)
+ r('DATP record offset', 'datp_record_offset')
+ a('DATP record count: %r'%self.datp_record_count)
+ a('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth))
if self.has_drm_data:
- ans.append('Unknown3: %r'%self.unknown3)
- ans.append('DRM Offset: %s'%self.drm_offset)
- ans.append('DRM Count: %s'%self.drm_count)
- ans.append('DRM Size: %s'%self.drm_size)
- ans.append('DRM Flags: %r'%self.drm_flags)
+ a('Unknown3: %r'%self.unknown3)
+ r('DRM Offset', 'drm_offset')
+ a('DRM Count: %s'%self.drm_count)
+ a('DRM Size: %s'%self.drm_size)
+ a('DRM Flags: %r'%self.drm_flags)
if self.has_extra_data_flags:
- ans.append('Unknown4: %r'%self.unknown4)
- ans.append('FDST Index: %d'% self.fdst_idx)
- ans.append('FDST Count: %d'% self.fdst_count)
- ans.append('FCIS number: %d'% self.fcis_number)
- ans.append('FCIS count: %d'% self.fcis_count)
- ans.append('FLIS number: %d'% self.flis_number)
- ans.append('FLIS count: %d'% self.flis_count)
- ans.append('Unknown6: %r'% self.unknown6)
- ans.append('SRCS record index: %d'%self.srcs_record_index)
- ans.append('Number of SRCS records?: %d'%self.num_srcs_records)
- ans.append('Unknown7: %r'%self.unknown7)
- ans.append(('Extra data flags: %s (has multibyte: %s) '
+ a('Unknown4: %r'%self.unknown4)
+ r('FDST Index', 'fdst_idx')
+ a('FDST Count: %d'% self.fdst_count)
+ r('FCIS number', 'fcis_number')
+ a('FCIS count: %d'% self.fcis_count)
+ r('FLIS number', 'flis_number')
+ a('FLIS count: %d'% self.flis_count)
+ a('Unknown6: %r'% self.unknown6)
+ r('SRCS record index', 'srcs_record_index')
+ a('Number of SRCS records?: %d'%self.num_srcs_records)
+ a('Unknown7: %r'%self.unknown7)
+ a(('Extra data flags: %s (has multibyte: %s) '
'(has indexing: %s) (has uncrossable breaks: %s)')%(
bin(self.extra_data_flags), self.has_multibytes,
self.has_indexing_bytes, self.has_uncrossable_breaks ))
- ans.append('Primary index record (null value: %d): %d'%(NULL_INDEX,
- self.primary_index_record))
- if self.file_version >= 8:
- i('Sections Index', self.sect_idx)
- i('SKEL Index', self.skel_idx)
- i('DATP Index', self.datp_idx)
- i('Other Index', self.oth_idx)
+ r('NCX index', 'primary_index_record')
+ if self.length >= 248:
+ r('Sections Index', 'sect_idx')
+ r('SKEL Index', 'skel_idx')
+ r('DATP Index', 'datp_idx')
+ r('Other Index', 'oth_idx')
if self.unknown9:
a('Unknown9: %r'%self.unknown9)
diff --git a/src/calibre/ebooks/mobi/debug/index.py b/src/calibre/ebooks/mobi/debug/index.py
index 1af1611918..f005c8b24f 100644
--- a/src/calibre/ebooks/mobi/debug/index.py
+++ b/src/calibre/ebooks/mobi/debug/index.py
@@ -17,10 +17,12 @@ from calibre.ebooks.mobi.reader.ncx import (tag_fieldname_map, default_entry)
File = namedtuple('File',
'file_number name divtbl_count start_position length')
-Elem = namedtuple('Elem',
+Elem = namedtuple('Chunk',
'insert_pos toc_text file_number sequence_number start_pos '
'length')
+GuideRef = namedtuple('GuideRef', 'type title pos_fid')
+
def read_index(sections, idx, codec):
table, cncx = OrderedDict(), CNCX([], codec)
@@ -80,6 +82,9 @@ class Index(object):
def __str__(self):
return '\n'.join(self.render())
+ def __iter__(self):
+ return iter(self.records)
+
class SKELIndex(Index):
def __init__(self, skelidx, records, codec):
@@ -110,7 +115,7 @@ class SECTIndex(Index):
for i, text in enumerate(self.table.iterkeys()):
tag_map = self.table[text]
if set(tag_map.iterkeys()) != {2, 3, 4, 6}:
- raise ValueError('SECT Index has unknown tags: %s'%
+ raise ValueError('Chunk Index has unknown tags: %s'%
(set(tag_map.iterkeys())-{2, 3, 4, 6}))
toc_text = self.cncx[tag_map[2][0]]
@@ -124,6 +129,28 @@ class SECTIndex(Index):
)
)
+class GuideIndex(Index):
+
+ def __init__(self, guideidx, records, codec):
+ super(GuideIndex, self).__init__(guideidx, records, codec)
+ self.records = []
+
+ if self.table is not None:
+ for i, text in enumerate(self.table.iterkeys()):
+ tag_map = self.table[text]
+ if set(tag_map.iterkeys()) not in ({1, 6}, {1, 2, 3}):
+ raise ValueError('Guide Index has unknown tags: %s'%
+ tag_map)
+
+ title = self.cncx[tag_map[1][0]]
+ self.records.append(GuideRef(
+ text,
+ title,
+ tag_map[6] if 6 in tag_map else (tag_map[2], tag_map[3])
+ )
+ )
+
+
class NCXIndex(Index):
def __init__(self, ncxidx, records, codec):
diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py
index 9dcc298742..40470ad2dd 100644
--- a/src/calibre/ebooks/mobi/debug/mobi8.py
+++ b/src/calibre/ebooks/mobi/debug/mobi8.py
@@ -10,9 +10,11 @@ __docformat__ = 'restructuredtext en'
import sys, os, imghdr, struct
from itertools import izip
+from calibre import CurrentDir
from calibre.ebooks.mobi.debug.headers import TextRecord
-from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex)
-from calibre.ebooks.mobi.utils import read_font_record
+from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex,
+ GuideIndex)
+from calibre.ebooks.mobi.utils import read_font_record, decode_tbs
from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
@@ -43,6 +45,24 @@ class FDST(object):
return '\n'.join(ans)
+class File(object):
+
+ def __init__(self, skel, skeleton, text, first_aid, sections):
+ self.name = 'part%04d'%skel.file_number
+ self.skeleton, self.text, self.first_aid = skeleton, text, first_aid
+ self.sections = sections
+
+ def dump(self, ddir):
+ with open(os.path.join(ddir, self.name + '.html'), 'wb') as f:
+ f.write(self.text)
+ base = os.path.join(ddir, self.name + '-parts')
+ os.mkdir(base)
+ with CurrentDir(base):
+ with open('skeleton.html', 'wb') as f:
+ f.write(self.skeleton)
+ for i, text in enumerate(self.sections):
+ with open('sect-%04d.html'%i, 'wb') as f:
+ f.write(text)
class MOBIFile(object):
@@ -67,6 +87,8 @@ class MOBIFile(object):
self.extract_resources()
self.read_fdst()
self.read_indices()
+ self.build_files()
+ self.read_tbs()
def print_header(self, f=sys.stdout):
print (str(self.mf.palmdb).encode('utf-8'), file=f)
@@ -94,6 +116,37 @@ class MOBIFile(object):
self.header.encoding)
self.ncx_index = NCXIndex(self.header.primary_index_record,
self.mf.records, self.header.encoding)
+ self.guide_index = GuideIndex(self.header.oth_idx, self.mf.records,
+ self.header.encoding)
+
+ def build_files(self):
+ text = self.raw_text
+ self.files = []
+ for skel in self.skel_index.records:
+ sects = [x for x in self.sect_index.records if x.file_number
+ == skel.file_number]
+ skeleton = text[skel.start_position:skel.start_position+skel.length]
+ ftext = skeleton
+ first_aid = sects[0].toc_text
+ sections = []
+
+ for sect in sects:
+ start_pos = skel.start_position + skel.length + sect.start_pos
+ sect_text = text[start_pos:start_pos+sect.length]
+ insert_pos = sect.insert_pos - skel.start_position
+ ftext = ftext[:insert_pos] + sect_text + ftext[insert_pos:]
+ sections.append(sect_text)
+
+ self.files.append(File(skel, skeleton, ftext, first_aid, sections))
+
+ def dump_flows(self, ddir):
+ if self.fdst is None:
+ raise ValueError('This MOBI file has no FDST record')
+ for i, x in enumerate(self.fdst.sections):
+ start, end = x
+ raw = self.raw_text[start:end]
+ with open(os.path.join(ddir, 'flow%04d.txt'%i), 'wb') as f:
+ f.write(raw)
def extract_resources(self):
self.resource_map = []
@@ -131,6 +184,54 @@ class MOBIFile(object):
self.resource_map.append(('%s/%06d%s.%s'%(prefix, i, suffix, ext),
payload))
+ def read_tbs(self):
+ from calibre.ebooks.mobi.writer8.tbs import (Entry,
+ collect_indexing_data)
+ entry_map = []
+ for index in self.ncx_index:
+ enders = [e['pos'] for e in self.ncx_index if e['pos'] >
+ index['pos'] and
+ e['hlvl'] <= index['hlvl']]
+ end = min(enders+[len(self.raw_text)])
+
+ entry_map.append(Entry(index=index['num'], title=index['text'],
+ depth=index['hlvl'],
+ parent=index['parent'] if index['parent'] > -1 else None,
+ first_child=index['child1'] if index['child1'] > -1 else None,
+ last_child=index['childn'] if index['childn'] > -1 else None,
+ start=index['pos'], length=end-index['pos']))
+
+ indexing_data = collect_indexing_data(entry_map,
+ len(self.text_records))
+ self.indexing_data = []
+ for i, data in enumerate(indexing_data):
+ rec = self.text_records[i]
+ tbs_bytes = rec.trailing_data.get('indexing', b'')
+ desc = ['Record #%d'%i]
+ for x in ('starts', 'completes', 'ends', 'spans'):
+ points = ['\t%d at depth: %d'%(e.index, e.depth) for e in
+ getattr(data, x)]
+ if points:
+ desc.append(x+':')
+ desc.extend(points)
+ desc.append('TBS Bytes: ' + format_bytes(tbs_bytes))
+ flag_sz = 3
+ sequences = []
+ while tbs_bytes:
+ try:
+ val, extra, consumed = decode_tbs(tbs_bytes, flag_size=flag_sz)
+ except:
+ break
+ flag_sz = 4
+ tbs_bytes = tbs_bytes[consumed:]
+ extra = {bin(k):v for k, v in extra.iteritems()}
+ sequences.append((val, extra))
+ for i, seq in enumerate(sequences):
+ desc.append('Sequence #%d: %r %r'%(i, seq[0], seq[1]))
+ if tbs_bytes:
+ desc.append('Remaining bytes: %s'%format_bytes(tbs_bytes))
+ desc.append('')
+ self.indexing_data.append('\n'.join(desc))
def inspect_mobi(mobi_file, ddir):
f = MOBIFile(mobi_file)
@@ -141,7 +242,8 @@ def inspect_mobi(mobi_file, ddir):
with open(alltext, 'wb') as of:
of.write(f.raw_text)
- for x in ('text_records', 'images', 'fonts', 'binary'):
+ for x in ('text_records', 'images', 'fonts', 'binary', 'files', 'flows',
+ 'tbs'):
os.mkdir(os.path.join(ddir, x))
for rec in f.text_records:
@@ -158,9 +260,21 @@ def inspect_mobi(mobi_file, ddir):
with open(os.path.join(ddir, 'skel.record'), 'wb') as fo:
fo.write(str(f.skel_index).encode('utf-8'))
- with open(os.path.join(ddir, 'sect.record'), 'wb') as fo:
+ with open(os.path.join(ddir, 'chunks.record'), 'wb') as fo:
fo.write(str(f.sect_index).encode('utf-8'))
with open(os.path.join(ddir, 'ncx.record'), 'wb') as fo:
fo.write(str(f.ncx_index).encode('utf-8'))
+ with open(os.path.join(ddir, 'guide.record'), 'wb') as fo:
+ fo.write(str(f.guide_index).encode('utf-8'))
+
+ with open(os.path.join(ddir, 'tbs', 'all.txt'), 'wb') as fo:
+ fo.write(('\n'.join(f.indexing_data)).encode('utf-8'))
+
+ for part in f.files:
+ part.dump(os.path.join(ddir, 'files'))
+
+ f.dump_flows(os.path.join(ddir, 'flows'))
+
+
diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py
index 4af7fdbf2c..d276689224 100644
--- a/src/calibre/ebooks/mobi/mobiml.py
+++ b/src/calibre/ebooks/mobi/mobiml.py
@@ -10,7 +10,7 @@ import copy
import re
from lxml import etree
from calibre.ebooks.oeb.base import namespace, barename
-from calibre.ebooks.oeb.base import XHTML, XHTML_NS, OEB_DOCS, urlnormalize
+from calibre.ebooks.oeb.base import XHTML, XHTML_NS, urlnormalize
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.transforms.flatcss import KeyMapper
from calibre.utils.magick.draw import identify_data
@@ -109,26 +109,8 @@ class MobiMLizer(object):
self.profile = profile = context.dest
self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items())
self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys())
- self.remove_html_cover()
self.mobimlize_spine()
- def remove_html_cover(self):
- oeb = self.oeb
- if not oeb.metadata.cover \
- or 'cover' not in oeb.guide:
- return
- href = oeb.guide['cover'].href
- del oeb.guide['cover']
- item = oeb.manifest.hrefs[href]
- if item.spine_position is not None:
- self.log.warn('Found an HTML cover,', item.href, 'removing it.',
- 'If you find some content missing from the output MOBI, it '
- 'is because you misidentified the HTML cover in the input '
- 'document')
- oeb.spine.remove(item)
- if item.media_type in OEB_DOCS:
- self.oeb.manifest.remove(item)
-
def mobimlize_spine(self):
'Iterate over the spine and convert it to MOBIML'
for item in self.oeb.spine:
diff --git a/src/calibre/ebooks/mobi/reader/index.py b/src/calibre/ebooks/mobi/reader/index.py
index f5add94eac..c732d8862e 100644
--- a/src/calibre/ebooks/mobi/reader/index.py
+++ b/src/calibre/ebooks/mobi/reader/index.py
@@ -114,6 +114,7 @@ class CNCX(object): # {{{
def __bool__(self):
return bool(self.records)
+ __nonzero__ = __bool__
def iteritems(self):
return self.records.iteritems()
diff --git a/src/calibre/ebooks/mobi/reader/markup.py b/src/calibre/ebooks/mobi/reader/markup.py
index 8bb7f211f3..8a06bc346a 100644
--- a/src/calibre/ebooks/mobi/reader/markup.py
+++ b/src/calibre/ebooks/mobi/reader/markup.py
@@ -223,15 +223,15 @@ def insert_images_into_markup(parts, resource_map, log):
# Handle any embedded raster images links in the xhtml text
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
img_pattern = re.compile(r'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
- img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')
+ img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^')"]*[)'"]''')
+
+ style_pattern = re.compile(r'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''',
+ re.IGNORECASE)
+
for i in xrange(len(parts)):
part = parts[i]
- #[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
-
- # links to raster image files
- # image_pattern
srcpieces = img_pattern.split(part)
- for j in range(1, len(srcpieces), 2):
+ for j in xrange(1, len(srcpieces), 2):
tag = srcpieces[j]
if tag.startswith(']*>)''', re.IGNORECASE)
diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py
index bf068eb498..dcf2f998b2 100644
--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@@ -109,7 +109,7 @@ class Mobi8Reader(object):
table, cncx = read_index(self.kf8_sections, self.header.othidx,
self.header.codec)
Item = namedtuple('Item',
- 'type title div_frag_num')
+ 'type title pos_fid')
for i, ref_type in enumerate(table.iterkeys()):
tag_map = table[ref_type]
@@ -119,7 +119,7 @@ class Mobi8Reader(object):
if 3 in tag_map.keys():
fileno = tag_map[3][0]
if 6 in tag_map.keys():
- fileno = tag_map[6][0]
+ fileno = tag_map[6]
self.guide.append(Item(ref_type.decode(self.header.codec),
title, fileno))
@@ -287,23 +287,24 @@ class Mobi8Reader(object):
def create_guide(self):
guide = Guide()
- for ref_type, ref_title, fileno in self.guide:
+ has_start = False
+ for ref_type, ref_title, pos_fid in self.guide:
try:
- elem = self.elems[fileno]
- except IndexError:
- # Happens for thumbnailstandard in Amazon book samples
- continue
- fi = self.get_file_info(elem.insert_pos)
- idtext = self.get_id_tag(elem.insert_pos).decode(self.header.codec)
- linktgt = fi.filename
+ if len(pos_fid) != 2:
+ continue
+ except TypeError:
+ continue # thumbnailstandard record, ignore it
+ linktgt, idtext = self.get_id_tag_by_pos_fid(*pos_fid)
if idtext:
linktgt += b'#' + idtext
- g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwdu())
+ g = Guide.Reference(linktgt, os.getcwdu())
g.title, g.type = ref_title, ref_type
+ if g.title == 'start' or g.type == 'text':
+ has_start = True
guide.append(g)
so = self.header.exth.start_offset
- if so not in {None, NULL_INDEX}:
+ if so not in {None, NULL_INDEX} and not has_start:
fi = self.get_file_info(so)
if fi.filename is not None:
idtext = self.get_id_tag(so).decode(self.header.codec)
diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py
index 3530736ba0..ae8e583a1b 100644
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -7,13 +7,15 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
-import struct, string, imghdr, zlib
+import struct, string, imghdr, zlib, os
from collections import OrderedDict
+from io import BytesIO
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
from calibre.ebooks import normalize
IMAGE_MAX_SIZE = 10 * 1024 * 1024
+RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
def decode_string(raw, codec='utf-8', ordt_map=''):
length, = struct.unpack(b'>B', raw[0])
@@ -364,15 +366,17 @@ def count_set_bits(num):
num >>= 1
return ans
-def to_base(num, base=32):
+def to_base(num, base=32, min_num_digits=None):
digits = string.digits + string.ascii_uppercase
sign = 1 if num >= 0 else -1
- if num == 0: return '0'
+ if num == 0: return ('0' if min_num_digits is None else '0'*min_num_digits)
num *= sign
ans = []
while num:
ans.append(digits[(num % base)])
num //= base
+ if min_num_digits is not None and len(ans) < min_num_digits:
+ ans.extend('0'*(min_num_digits - len(ans)))
if sign < 0:
ans.append('-')
ans.reverse()
@@ -388,27 +392,8 @@ def mobify_image(data):
data = im.export('gif')
return data
-def read_zlib_header(header):
- header = bytearray(header)
- # See sec 2.2 of RFC 1950 for the zlib stream format
- # http://www.ietf.org/rfc/rfc1950.txt
- if (header[0]*256 + header[1])%31 != 0:
- return None, 'Bad zlib header, FCHECK failed'
-
- cmf = header[0] & 0b1111
- cinfo = header[0] >> 4
- if cmf != 8:
- return None, 'Unknown zlib compression method: %d'%cmf
- if cinfo > 7:
- return None, 'Invalid CINFO field in zlib header: %d'%cinfo
- fdict = (header[1]&0b10000)>>5
- if fdict != 0:
- return None, 'FDICT based zlib compression not supported'
- wbits = cinfo + 8
- return wbits, None
-
-
-def read_font_record(data, extent=1040): # {{{
+# Font records {{{
+def read_font_record(data, extent=1040):
'''
Return the font encoded in the MOBI FONT record represented by data.
The return value in a dict with fields raw_data, font_data, err, ext,
@@ -466,15 +451,8 @@ def read_font_record(data, extent=1040): # {{{
if flags & 0b1:
# ZLIB compressed data
- wbits, err = read_zlib_header(font_data[:2])
- if err is not None:
- ans['err'] = err
- return ans
- adler32, = struct.unpack_from(b'>I', font_data, len(font_data) - 4)
try:
- # remove two bytes of zlib header and 4 bytes of trailing checksum
- # negative wbits indicates no standard gzip header
- font_data = zlib.decompress(font_data[2:-4], -wbits, usize)
+ font_data = zlib.decompress(font_data)
except Exception as e:
ans['err'] = 'Failed to zlib decompress font data (%s)'%e
return ans
@@ -483,23 +461,146 @@ def read_font_record(data, extent=1040): # {{{
ans['err'] = 'Uncompressed font size mismatch'
return ans
- if False:
- # For some reason these almost never match, probably Amazon has a
- # buggy Adler32 implementation
- sig = (zlib.adler32(font_data) & 0xffffffff)
- if sig != adler32:
- ans['err'] = ('Adler checksum did not match. Stored: %d '
- 'Calculated: %d')%(adler32, sig)
- return ans
-
ans['font_data'] = font_data
sig = font_data[:4]
ans['ext'] = ('ttf' if sig in {b'\0\1\0\0', b'true', b'ttcf'}
else 'otf' if sig == b'OTTO' else 'dat')
return ans
+
+def write_font_record(data, obfuscate=True, compress=True):
+ '''
+ Write the ttf/otf font represented by data into a font record. See
+ read_font_record() for details on the format of the record.
+ '''
+
+ flags = 0
+ key_len = 20
+ usize = len(data)
+ xor_key = b''
+ if compress:
+ flags |= 0b1
+ data = zlib.compress(data, 9)
+ if obfuscate:
+ flags |= 0b10
+ xor_key = os.urandom(key_len)
+ key = bytearray(xor_key)
+ data = bytearray(data)
+ for i in xrange(1040):
+ data[i] ^= key[i%key_len]
+ data = bytes(data)
+
+ key_start = struct.calcsize(b'>5L') + 4
+ data_start = key_start + len(xor_key)
+
+ header = b'FONT' + struct.pack(b'>5L', usize, flags, data_start,
+ len(xor_key), key_start)
+
+ return header + xor_key + data
+
# }}}
+def create_text_record(text):
+ '''
+ Return a Palmdoc record of size RECORD_SIZE from the text file object.
+ In case the record ends in the middle of a multibyte character return
+ the overlap as well.
+ Returns data, overlap: where both are byte strings. overlap is the
+ extra bytes needed to complete the truncated multibyte character.
+ '''
+ opos = text.tell()
+ text.seek(0, 2)
+ # npos is the position of the next record
+ npos = min((opos + RECORD_SIZE, text.tell()))
+ # Number of bytes from the next record needed to complete the last
+ # character in this record
+ extra = 0
+ last = b''
+ while not last.decode('utf-8', 'ignore'):
+ # last contains no valid utf-8 characters
+ size = len(last) + 1
+ text.seek(npos - size)
+ last = text.read(size)
+
+ # last now has one valid utf-8 char and possibly some bytes that belong
+ # to a truncated char
+
+ try:
+ last.decode('utf-8', 'strict')
+ except UnicodeDecodeError:
+ # There are some truncated bytes in last
+ prev = len(last)
+ while True:
+ text.seek(npos - prev)
+ last = text.read(len(last) + 1)
+ try:
+ last.decode('utf-8')
+ except UnicodeDecodeError:
+ pass
+ else:
+ break
+ extra = len(last) - prev
+
+ text.seek(opos)
+ data = text.read(RECORD_SIZE)
+ overlap = text.read(extra)
+ text.seek(npos)
+
+ return data, overlap
+
+class CNCX(object): # {{{
+
+ '''
+ Create the CNCX records. These are records containing all the strings from
+    an index. Each record is of the form: <vwi string size><utf8 string>
+ '''
+
+ MAX_STRING_LENGTH = 500
+
+ def __init__(self, strings=()):
+ self.strings = OrderedDict((s, 0) for s in strings)
+
+ self.records = []
+ offset = 0
+ buf = BytesIO()
+ for key in tuple(self.strings.iterkeys()):
+ utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
+ l = len(utf8)
+ sz_bytes = encint(l)
+ raw = sz_bytes + utf8
+ if 0xfbf8 - buf.tell() < 6 + len(raw):
+ # Records in PDB files cannot be larger than 0x10000, so we
+ # stop well before that.
+ pad = 0xfbf8 - buf.tell()
+ buf.write(b'\0' * pad)
+ self.records.append(buf.getvalue())
+ buf.seek(0), buf.truncate(0)
+ offset = len(self.records) * 0x10000
+ buf.write(raw)
+ self.strings[key] = offset
+ offset += len(raw)
+
+ val = buf.getvalue()
+ if val:
+ self.records.append(align_block(val))
+
+ def __getitem__(self, string):
+ return self.strings[string]
+
+ def __bool__(self):
+ return bool(self.records)
+ __nonzero__ = __bool__
+
+ def __len__(self):
+ return len(self.records)
+
+# }}}
+
+def is_guide_ref_start(ref):
+ return (ref.title.lower() == 'start' or
+ (ref.type and ref.type.lower() in {'start',
+ 'other.start', 'text'}))
diff --git a/src/calibre/ebooks/mobi/writer2/__init__.py b/src/calibre/ebooks/mobi/writer2/__init__.py
index bc8dbbf7de..df3dcefb94 100644
--- a/src/calibre/ebooks/mobi/writer2/__init__.py
+++ b/src/calibre/ebooks/mobi/writer2/__init__.py
@@ -12,5 +12,4 @@ UNCOMPRESSED = 1
PALMDOC = 2
HUFFDIC = 17480
PALM_MAX_IMAGE_SIZE = 63 * 1024
-RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py
index e349172d95..183697a1b4 100644
--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@@ -12,56 +12,22 @@ from struct import pack
from cStringIO import StringIO
from collections import OrderedDict, defaultdict
-from calibre.ebooks.mobi.writer2 import RECORD_SIZE
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
- encode_tbs, align_block, utf8_text)
+ encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_)
-class CNCX(object): # {{{
-
- '''
- Create the CNCX records. These are records containing all the strings from
- the NCX. Each record is of the form:
- '''
-
- MAX_STRING_LENGTH = 500
+class CNCX(CNCX_): # {{{
def __init__(self, toc, is_periodical):
- self.strings = OrderedDict()
-
+ strings = []
for item in toc.iterdescendants(breadth_first=True):
- self.strings[item.title] = 0
+ strings.append(item.title)
if is_periodical:
- self.strings[item.klass] = 0
+ strings.append(item.klass)
if item.author:
- self.strings[item.author] = 0
+ strings.append(item.author)
if item.description:
- self.strings[item.description] = 0
-
- self.records = []
- offset = 0
- buf = StringIO()
- for key in tuple(self.strings.iterkeys()):
- utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
- l = len(utf8)
- sz_bytes = encint(l)
- raw = sz_bytes + utf8
- if 0xfbf8 - buf.tell() < 6 + len(raw):
- # Records in PDB files cannot be larger than 0x10000, so we
- # stop well before that.
- pad = 0xfbf8 - buf.tell()
- buf.write(b'\0' * pad)
- self.records.append(buf.getvalue())
- buf.truncate(0)
- offset = len(self.records) * 0x10000
- buf.write(raw)
- self.strings[key] = offset
- offset += len(raw)
-
- self.records.append(align_block(buf.getvalue()))
-
- def __getitem__(self, string):
- return self.strings[string]
+ strings.append(item.description)
+ CNCX_.__init__(self, strings)
# }}}
class TAGX(object): # {{{
@@ -534,14 +500,14 @@ class Indexer(object): # {{{
# Write offsets to index entries as an IDXT block
idxt_block = b'IDXT'
- buf.truncate(0)
+ buf.seek(0), buf.truncate(0)
for offset in offsets:
buf.write(pack(b'>H', header_length+offset))
idxt_block = align_block(idxt_block + buf.getvalue())
body = index_block + idxt_block
header = b'INDX'
- buf.truncate(0)
+ buf.seek(0), buf.truncate(0)
buf.write(pack(b'>I', header_length))
buf.write(b'\0'*4) # Unknown
buf.write(pack(b'>I', 1)) # Header type? Or index record number?
diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py
index 99321fab12..27c4838a4b 100644
--- a/src/calibre/ebooks/mobi/writer2/main.py
+++ b/src/calibre/ebooks/mobi/writer2/main.py
@@ -7,51 +7,31 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
-import re, random, time
+import random, time
from cStringIO import StringIO
from struct import pack
-from calibre.ebooks import normalize, generate_masthead
-from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
+from calibre.ebooks import normalize
from calibre.ebooks.mobi.writer2.serializer import Serializer
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE)
-from calibre.ebooks.mobi.utils import (rescale_image, encint, mobify_image,
- encode_trailing_data, align_block, detect_periodical)
+from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED)
+from calibre.ebooks.mobi.utils import (encint, encode_trailing_data,
+ align_block, detect_periodical, RECORD_SIZE, create_text_record)
from calibre.ebooks.mobi.writer2.indexer import Indexer
-from calibre.ebooks.mobi import MAX_THUMB_DIMEN, MAX_THUMB_SIZE
-
-EXTH_CODES = {
- 'creator': 100,
- 'publisher': 101,
- 'description': 103,
- 'identifier': 104,
- 'subject': 105,
- 'pubdate': 106,
- 'review': 107,
- 'contributor': 108,
- 'rights': 109,
- 'type': 111,
- 'source': 112,
- 'versionnumber': 114,
- 'startreading': 116,
- 'coveroffset': 201,
- 'thumboffset': 202,
- 'hasfakecover': 203,
- 'lastupdatetime': 502,
- 'title': 503,
- }
# Disabled as I dont care about uncrossable breaks
WRITE_UNCROSSABLE_BREAKS = False
+NULL_INDEX = 0xffffffff
class MobiWriter(object):
- COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
- def __init__(self, opts, write_page_breaks_after_item=True):
+ def __init__(self, opts, resources, kf8, write_page_breaks_after_item=True):
self.opts = opts
+ self.resources = resources
+ self.kf8 = kf8
+ self.for_joint = kf8 is not None
self.write_page_breaks_after_item = write_page_breaks_after_item
self.compression = UNCOMPRESSED if opts.dont_compress else PALMDOC
self.prefer_author_sort = opts.prefer_author_sort
@@ -83,7 +63,7 @@ class MobiWriter(object):
self.stream = stream
self.records = [None]
self.generate_content()
- self.generate_record0()
+ self.generate_joint_record0() if self.for_joint else self.generate_record0()
self.write_header()
self.write_content()
@@ -151,73 +131,19 @@ class MobiWriter(object):
# Images {{{
def generate_images(self):
- oeb = self.oeb
- oeb.logger.info('Serializing images...')
- self.image_records = []
- self.image_map = {}
- self.masthead_offset = 0
- index = 1
+ resources = self.resources
+ image_records = resources.records
+ self.image_map = resources.item_map
+ self.masthead_offset = resources.masthead_offset
+ self.cover_offset = resources.cover_offset
+ self.thumbnail_offset = resources.thumbnail_offset
- mh_href = None
- if 'masthead' in oeb.guide and oeb.guide['masthead'].href:
- mh_href = oeb.guide['masthead'].href
- self.image_records.append(None)
- index += 1
- elif self.is_periodical:
- # Generate a default masthead
- data = generate_masthead(unicode(self.oeb.metadata['title'][0]))
- self.image_records.append(data)
- index += 1
-
- cover_href = self.cover_offset = self.thumbnail_offset = None
- if (oeb.metadata.cover and
- unicode(oeb.metadata.cover[0]) in oeb.manifest.ids):
- cover_id = unicode(oeb.metadata.cover[0])
- item = oeb.manifest.ids[cover_id]
- cover_href = item.href
-
- for item in self.oeb.manifest.values():
- if item.media_type not in OEB_RASTER_IMAGES: continue
- try:
- data = item.data
- if self.opts.mobi_keep_original_images:
- data = mobify_image(data)
- else:
- data = rescale_image(data)
- except:
- oeb.logger.warn('Bad image file %r' % item.href)
- continue
- else:
- if mh_href and item.href == mh_href:
- self.image_records[0] = data
- continue
-
- self.image_records.append(data)
- self.image_map[item.href] = index
- index += 1
-
- if cover_href and item.href == cover_href:
- self.cover_offset = self.image_map[item.href] - 1
- try:
- data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
- maxsizeb=MAX_THUMB_SIZE)
- except:
- oeb.logger.warn('Failed to generate thumbnail')
- else:
- self.image_records.append(data)
- self.thumbnail_offset = index - 1
- index += 1
- finally:
- item.unload_data_from_memory()
-
- if self.image_records and self.image_records[0] is None:
+ if image_records and image_records[0] is None:
raise ValueError('Failed to find masthead image in manifest')
# }}}
- # Text {{{
-
- def generate_text(self):
+ def generate_text(self): # {{{
self.oeb.logger.info('Serializing markup content...')
self.serializer = Serializer(self.oeb, self.image_map,
self.is_periodical,
@@ -232,7 +158,7 @@ class MobiWriter(object):
self.oeb.logger.info(' Compressing markup content...')
while text.tell() < self.text_length:
- data, overlap = self.read_text_record(text)
+ data, overlap = create_text_record(text)
if self.compression == PALMDOC:
data = compress_doc(data)
@@ -249,57 +175,6 @@ class MobiWriter(object):
if records_size % 4 != 0:
self.records.append(b'\x00'*(records_size % 4))
self.first_non_text_record_idx += 1
-
- def read_text_record(self, text):
- '''
- Return a Palmdoc record of size RECORD_SIZE from the text file object.
- In case the record ends in the middle of a multibyte character return
- the overlap as well.
-
- Returns data, overlap: where both are byte strings. overlap is the
- extra bytes needed to complete the truncated multibyte character.
- '''
- opos = text.tell()
- text.seek(0, 2)
- # npos is the position of the next record
- npos = min((opos + RECORD_SIZE, text.tell()))
- # Number of bytes from the next record needed to complete the last
- # character in this record
- extra = 0
-
- last = b''
- while not last.decode('utf-8', 'ignore'):
- # last contains no valid utf-8 characters
- size = len(last) + 1
- text.seek(npos - size)
- last = text.read(size)
-
- # last now has one valid utf-8 char and possibly some bytes that belong
- # to a truncated char
-
- try:
- last.decode('utf-8', 'strict')
- except UnicodeDecodeError:
- # There are some truncated bytes in last
- prev = len(last)
- while True:
- text.seek(npos - prev)
- last = text.read(len(last) + 1)
- try:
- last.decode('utf-8')
- except UnicodeDecodeError:
- pass
- else:
- break
- extra = len(last) - prev
-
- text.seek(opos)
- data = text.read(RECORD_SIZE)
- overlap = text.read(extra)
- text.seek(npos)
-
- return data, overlap
-
# }}}
def generate_record0(self): # MOBI header {{{
@@ -315,11 +190,20 @@ class MobiWriter(object):
# header as well
bt = 0x103 if self.indexer.is_flat_periodical else 0x101
- exth = self.build_exth(bt)
+ from calibre.ebooks.mobi.writer8.exth import build_exth
+ exth = build_exth(metadata,
+ prefer_author_sort=self.opts.prefer_author_sort,
+ is_periodical=self.is_periodical,
+ share_not_sync=self.opts.share_not_sync,
+ cover_offset=self.cover_offset,
+ thumbnail_offset=self.thumbnail_offset,
+ start_offset=self.serializer.start_offset, mobi_doctype=bt
+ )
first_image_record = None
- if self.image_records:
+ if self.resources:
+ used_images = self.serializer.used_images
first_image_record = len(self.records)
- self.records.extend(self.image_records)
+ self.resources.serialize(self.records, used_images)
last_content_record = len(self.records) - 1
# FCIS/FLIS (Seems to serve no purpose)
@@ -481,125 +365,72 @@ class MobiWriter(object):
self.records[0] = align_block(record0)
# }}}
- def build_exth(self, mobi_doctype): # EXTH Header {{{
- oeb = self.oeb
- exth = StringIO()
- nrecs = 0
- for term in oeb.metadata:
- if term not in EXTH_CODES: continue
- code = EXTH_CODES[term]
- items = oeb.metadata[term]
- if term == 'creator':
- if self.prefer_author_sort:
- creators = [normalize(unicode(c.file_as or c)) for c in
- items][:1]
- else:
- creators = [normalize(unicode(c)) for c in items]
- items = ['; '.join(creators)]
- for item in items:
- data = normalize(unicode(item))
- if term != 'description':
- data = self.COLLAPSE_RE.sub(' ', data)
- if term == 'identifier':
- if data.lower().startswith('urn:isbn:'):
- data = data[9:]
- elif item.scheme.lower() == 'isbn':
- pass
- else:
- continue
- data = data.encode('utf-8')
- exth.write(pack(b'>II', code, len(data) + 8))
- exth.write(data)
- nrecs += 1
- if term == 'rights' :
- try:
- rights = normalize(unicode(oeb.metadata.rights[0])).encode('utf-8')
- except:
- rights = b'Unknown'
- exth.write(pack(b'>II', EXTH_CODES['rights'], len(rights) + 8))
- exth.write(rights)
- nrecs += 1
+ def generate_joint_record0(self): # {{{
+ from calibre.ebooks.mobi.writer8.mobi import (MOBIHeader,
+ HEADER_FIELDS)
+ from calibre.ebooks.mobi.writer8.exth import build_exth
- # Write UUID as ASIN
- uuid = None
- from calibre.ebooks.oeb.base import OPF
- for x in oeb.metadata['identifier']:
- if (x.get(OPF('scheme'), None).lower() == 'uuid' or
- unicode(x).startswith('urn:uuid:')):
- uuid = unicode(x).split(':')[-1]
- break
- if uuid is None:
- from uuid import uuid4
- uuid = str(uuid4())
+ # Insert resource records
+ first_image_record = None
+ old = len(self.records)
+ if self.resources:
+ used_images = self.serializer.used_images | self.kf8.used_images
+ first_image_record = len(self.records)
+ self.resources.serialize(self.records, used_images)
+ resource_record_count = len(self.records) - old
- if isinstance(uuid, unicode):
- uuid = uuid.encode('utf-8')
- if not self.opts.share_not_sync:
- exth.write(pack(b'>II', 113, len(uuid) + 8))
- exth.write(uuid)
- nrecs += 1
+ # Insert KF8 records
+ self.records.append(b'BOUNDARY')
+ kf8_header_index = len(self.records)
+ self.kf8.start_offset = (self.serializer.start_offset,
+ self.kf8.start_offset)
+ self.records.append(self.kf8.record0)
+ self.records.extend(self.kf8.records[1:])
- # Write cdetype
- if not self.is_periodical:
- if not self.opts.share_not_sync:
- exth.write(pack(b'>II', 501, 12))
- exth.write(b'EBOK')
- nrecs += 1
- else:
- ids = {0x101:b'NWPR', 0x103:b'MAGZ'}.get(mobi_doctype, None)
- if ids:
- exth.write(pack(b'>II', 501, 12))
- exth.write(ids)
- nrecs += 1
+ first_image_record = (first_image_record if first_image_record else
+ len(self.records))
- # Add a publication date entry
- if oeb.metadata['date']:
- datestr = str(oeb.metadata['date'][0])
- elif oeb.metadata['timestamp']:
- datestr = str(oeb.metadata['timestamp'][0])
+ header_fields = {k:getattr(self.kf8, k) for k in HEADER_FIELDS}
- if datestr is None:
- raise ValueError("missing date or timestamp")
+ # Now change the header fields that need to be different in the MOBI 6
+ # header
+ header_fields['first_resource_record'] = first_image_record
+        header_fields['exth_flags'] = 0b100001010000 # Kindlegen uses this
+ header_fields['fdst_record'] = NULL_INDEX
+ header_fields['fdst_count'] = 1 # Why not 0? Kindlegen uses 1
+ extra_data_flags = 0b1 # Has multibyte overlap bytes
+ if self.primary_index_record_idx is not None:
+ extra_data_flags |= 0b10
+ header_fields['extra_data_flags'] = extra_data_flags
- datestr = bytes(datestr)
- exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8))
- exth.write(datestr)
- nrecs += 1
- if self.is_periodical:
- exth.write(pack(b'>II', EXTH_CODES['lastupdatetime'], len(datestr) + 8))
- exth.write(datestr)
- nrecs += 1
+ for k, v in {'last_text_record':'last_text_record_idx',
+ 'first_non_text_record':'first_non_text_record_idx',
+ 'ncx_index':'primary_index_record_idx',
+ }.iteritems():
+ header_fields[k] = getattr(self, v)
+ if header_fields['ncx_index'] is None:
+ header_fields['ncx_index'] = NULL_INDEX
- if self.is_periodical:
- # Pretend to be amazon's super secret periodical generator
- vals = {204:201, 205:2, 206:0, 207:101}
- else:
- # Pretend to be kindlegen 1.2
- vals = {204:201, 205:1, 206:2, 207:33307}
- for code, val in vals.iteritems():
- exth.write(pack(b'>III', code, 12, val))
- nrecs += 1
+ for x in ('skel', 'chunk', 'guide'):
+ header_fields[x+'_index'] = NULL_INDEX
- if self.cover_offset is not None:
- exth.write(pack(b'>III', EXTH_CODES['coveroffset'], 12,
- self.cover_offset))
- exth.write(pack(b'>III', EXTH_CODES['hasfakecover'], 12, 0))
- nrecs += 2
- if self.thumbnail_offset is not None:
- exth.write(pack(b'>III', EXTH_CODES['thumboffset'], 12,
- self.thumbnail_offset))
- nrecs += 1
+ # Create the MOBI 6 EXTH
+ opts = self.opts
+ kuc = 0 if resource_record_count > 0 else None
- if self.serializer.start_offset is not None:
- exth.write(pack(b'>III', EXTH_CODES['startreading'], 12,
- self.serializer.start_offset))
- nrecs += 1
+ header_fields['exth'] = build_exth(self.oeb.metadata,
+ prefer_author_sort=opts.prefer_author_sort,
+ is_periodical=opts.mobi_periodical,
+ share_not_sync=opts.share_not_sync,
+ cover_offset=self.cover_offset,
+ thumbnail_offset=self.thumbnail_offset,
+ num_of_resources=resource_record_count,
+ kf8_unknown_count=kuc, be_kindlegen2=True,
+ kf8_header_index=kf8_header_index,
+ start_offset=self.serializer.start_offset,
+ mobi_doctype=2)
+ self.records[0] = MOBIHeader(file_version=6)(**header_fields)
- exth = exth.getvalue()
- trail = len(exth) % 4
- pad = b'\0' * (4 - trail) # Always pad w/ at least 1 byte
- exth = [b'EXTH', pack(b'>II', len(exth) + 12, nrecs), exth, pad]
- return b''.join(exth)
# }}}
def write_header(self): # PalmDB header {{{
diff --git a/src/calibre/ebooks/mobi/writer2/resources.py b/src/calibre/ebooks/mobi/writer2/resources.py
new file mode 100644
index 0000000000..2fcb93790c
--- /dev/null
+++ b/src/calibre/ebooks/mobi/writer2/resources.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+import imghdr
+
+from calibre.ebooks.mobi import MAX_THUMB_DIMEN, MAX_THUMB_SIZE
+from calibre.ebooks.mobi.utils import (rescale_image, mobify_image,
+ write_font_record)
+from calibre.ebooks import generate_masthead
+from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
+
+PLACEHOLDER_GIF = b'GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff!\xf9\x04\x01\x00\x00\x00\x00,\x00\x00\x00\x00\x01\x00\x01\x00@\x02\x01D\x00;'
+
+class Resources(object):
+
+ def __init__(self, oeb, opts, is_periodical, add_fonts=False):
+ self.oeb, self.log, self.opts = oeb, oeb.log, opts
+ self.is_periodical = is_periodical
+
+ self.item_map = {}
+ self.records = []
+ self.mime_map = {}
+ self.masthead_offset = 0
+ self.used_image_indices = set()
+ self.image_indices = set()
+ self.cover_offset = self.thumbnail_offset = None
+
+ self.add_resources(add_fonts)
+
+ def process_image(self, data):
+ return (mobify_image(data) if self.opts.mobi_keep_original_images else
+ rescale_image(data))
+
+ def add_resources(self, add_fonts):
+ oeb = self.oeb
+ oeb.logger.info('Serializing resources...')
+ index = 1
+
+ mh_href = None
+ if 'masthead' in oeb.guide and oeb.guide['masthead'].href:
+ mh_href = oeb.guide['masthead'].href
+ self.records.append(None)
+ index += 1
+ self.used_image_indices.add(0)
+ self.image_indices.add(0)
+ elif self.is_periodical:
+ # Generate a default masthead
+ data = generate_masthead(unicode(self.oeb.metadata['title'][0]))
+ self.records.append(data)
+ self.used_image_indices.add(0)
+ self.image_indices.add(0)
+ index += 1
+
+ cover_href = self.cover_offset = self.thumbnail_offset = None
+ if (oeb.metadata.cover and
+ unicode(oeb.metadata.cover[0]) in oeb.manifest.ids):
+ cover_id = unicode(oeb.metadata.cover[0])
+ item = oeb.manifest.ids[cover_id]
+ cover_href = item.href
+
+ for item in self.oeb.manifest.values():
+ if item.media_type not in OEB_RASTER_IMAGES: continue
+ try:
+ data = self.process_image(item.data)
+ except:
+ self.log.warn('Bad image file %r' % item.href)
+ continue
+ else:
+ if mh_href and item.href == mh_href:
+ self.records[0] = data
+ continue
+
+ self.image_indices.add(len(self.records))
+ self.records.append(data)
+ self.item_map[item.href] = index
+ self.mime_map[item.href] = 'image/%s'%imghdr.what(None, data)
+ index += 1
+
+ if cover_href and item.href == cover_href:
+ self.cover_offset = self.item_map[item.href] - 1
+ self.used_image_indices.add(self.cover_offset)
+ try:
+ data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
+ maxsizeb=MAX_THUMB_SIZE)
+ except:
+ self.log.warn('Failed to generate thumbnail')
+ else:
+ self.image_indices.add(len(self.records))
+ self.records.append(data)
+ self.thumbnail_offset = index - 1
+ self.used_image_indices.add(self.thumbnail_offset)
+ index += 1
+ finally:
+ item.unload_data_from_memory()
+
+ if add_fonts:
+ for item in self.oeb.manifest.values():
+ if item.href and item.href.rpartition('.')[-1].lower() in {
+ 'ttf', 'otf'} and isinstance(item.data, bytes):
+ self.records.append(write_font_record(item.data))
+ self.item_map[item.href] = len(self.records)
+
+ def add_extra_images(self):
+ '''
+ Add any images that were created after the call to add_resources()
+ '''
+ for item in self.oeb.manifest.values():
+ if (item.media_type not in OEB_RASTER_IMAGES or item.href in
+ self.item_map): continue
+ try:
+ data = self.process_image(item.data)
+ except:
+ self.log.warn('Bad image file %r' % item.href)
+ else:
+ self.records.append(data)
+ self.item_map[item.href] = len(self.records)
+ finally:
+ item.unload_data_from_memory()
+
+ def serialize(self, records, used_images):
+ used_image_indices = self.used_image_indices | {
+ v-1 for k, v in self.item_map.iteritems() if k in used_images}
+ for i in self.image_indices-used_image_indices:
+ self.records[i] = PLACEHOLDER_GIF
+ records.extend(self.records)
+
+ def __bool__(self):
+ return bool(self.records)
+ __nonzero__ = __bool__
+
diff --git a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py
index b35f33439b..2dda657a93 100644
--- a/src/calibre/ebooks/mobi/writer2/serializer.py
+++ b/src/calibre/ebooks/mobi/writer2/serializer.py
@@ -12,6 +12,7 @@ import re
from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS,
namespace, prefixname, urlnormalize)
from calibre.ebooks.mobi.mobiml import MBP_NS
+from calibre.ebooks.mobi.utils import is_guide_ref_start
from collections import defaultdict
from urlparse import urldefrag
@@ -39,6 +40,7 @@ class Serializer(object):
self.oeb = oeb
# Map of image hrefs to image index in the MOBI file
self.images = images
+ self.used_images = set()
self.logger = oeb.logger
self.is_periodical = is_periodical
self.write_page_breaks_after_item = write_page_breaks_after_item
@@ -160,9 +162,7 @@ class Serializer(object):
buf.write(b'title="')
self.serialize_text(ref.title, quot=True)
buf.write(b'" ')
- if (ref.title.lower() == 'start' or
- (ref.type and ref.type.lower() in {'start',
- 'other.start', 'text'})):
+ if is_guide_ref_start(ref):
self._start_href = ref.href
self.serialize_href(ref.href)
# Space required or won't work, I kid you not
@@ -329,6 +329,7 @@ class Serializer(object):
href = urlnormalize(item.abshref(val))
if href in self.images:
index = self.images[href]
+ self.used_images.add(href)
buf.write(b'recindex="%05d"' % index)
continue
buf.write(attr.encode('utf-8'))
diff --git a/src/calibre/ebooks/mobi/writer8/exth.py b/src/calibre/ebooks/mobi/writer8/exth.py
new file mode 100644
index 0000000000..361b978528
--- /dev/null
+++ b/src/calibre/ebooks/mobi/writer8/exth.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+import re
+from struct import pack
+from io import BytesIO
+
+from calibre.ebooks.mobi.utils import utf8_text
+
+EXTH_CODES = {
+ 'creator': 100,
+ 'publisher': 101,
+ 'description': 103,
+ 'identifier': 104,
+ 'subject': 105,
+ 'pubdate': 106,
+ 'review': 107,
+ 'contributor': 108,
+ 'rights': 109,
+ 'type': 111,
+ 'source': 112,
+ 'versionnumber': 114,
+ 'startreading': 116,
+ 'kf8_header_index': 121,
+ 'num_of_resources': 125,
+ 'kf8_unknown_count': 131,
+ 'coveroffset': 201,
+ 'thumboffset': 202,
+ 'hasfakecover': 203,
+ 'lastupdatetime': 502,
+ 'title': 503,
+}
+
+COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
+
+def build_exth(metadata, prefer_author_sort=False, is_periodical=False,
+ share_not_sync=True, cover_offset=None, thumbnail_offset=None,
+ start_offset=None, mobi_doctype=2, num_of_resources=None,
+ kf8_unknown_count=0, be_kindlegen2=False, kf8_header_index=None):
+ exth = BytesIO()
+ nrecs = 0
+
+ for term in metadata:
+ if term not in EXTH_CODES: continue
+ code = EXTH_CODES[term]
+ items = metadata[term]
+ if term == 'creator':
+ if prefer_author_sort:
+ creators = [unicode(c.file_as or c) for c in
+ items][:1]
+ else:
+ creators = [unicode(c) for c in items]
+ items = ['; '.join(creators)]
+ for item in items:
+ data = unicode(item)
+ if term != 'description':
+ data = COLLAPSE_RE.sub(' ', data)
+ if term == 'identifier':
+ if data.lower().startswith('urn:isbn:'):
+ data = data[9:]
+ elif item.scheme.lower() == 'isbn':
+ pass
+ else:
+ continue
+ data = utf8_text(data)
+ exth.write(pack(b'>II', code, len(data) + 8))
+ exth.write(data)
+ nrecs += 1
+ if term == 'rights' :
+ try:
+ rights = utf8_text(unicode(metadata.rights[0]))
+ except:
+ rights = b'Unknown'
+ exth.write(pack(b'>II', EXTH_CODES['rights'], len(rights) + 8))
+ exth.write(rights)
+ nrecs += 1
+
+ # Write UUID as ASIN
+ uuid = None
+ from calibre.ebooks.oeb.base import OPF
+ for x in metadata['identifier']:
+        if (unicode(x).startswith('urn:uuid:') or
+                (x.get(OPF('scheme'), None) or '').lower() == 'uuid'):
+ uuid = unicode(x).split(':')[-1]
+ break
+ if uuid is None:
+ from uuid import uuid4
+ uuid = str(uuid4())
+
+ if isinstance(uuid, unicode):
+ uuid = uuid.encode('utf-8')
+ if not share_not_sync:
+ exth.write(pack(b'>II', 113, len(uuid) + 8))
+ exth.write(uuid)
+ nrecs += 1
+
+ # Write cdetype
+ if not is_periodical:
+ if not share_not_sync:
+ exth.write(pack(b'>II', 501, 12))
+ exth.write(b'EBOK')
+ nrecs += 1
+ else:
+ ids = {0x101:b'NWPR', 0x103:b'MAGZ'}.get(mobi_doctype, None)
+ if ids:
+ exth.write(pack(b'>II', 501, 12))
+ exth.write(ids)
+ nrecs += 1
+
+ # Add a publication date entry
+    datestr = None
+    if metadata['date']:
+        datestr = str(metadata['date'][0])
+    elif metadata['timestamp']:
+        datestr = str(metadata['timestamp'][0])
+    if datestr is None:
+ raise ValueError("missing date or timestamp")
+
+ datestr = bytes(datestr)
+ exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8))
+ exth.write(datestr)
+ nrecs += 1
+ if is_periodical:
+ exth.write(pack(b'>II', EXTH_CODES['lastupdatetime'], len(datestr) + 8))
+ exth.write(datestr)
+ nrecs += 1
+
+ if be_kindlegen2:
+ vals = {204:201, 205:2, 206:2, 207:35621}
+ elif is_periodical:
+ # Pretend to be amazon's super secret periodical generator
+ vals = {204:201, 205:2, 206:0, 207:101}
+ else:
+ # Pretend to be kindlegen 1.2
+ vals = {204:201, 205:1, 206:2, 207:33307}
+ for code, val in vals.iteritems():
+ exth.write(pack(b'>III', code, 12, val))
+ nrecs += 1
+
+ if cover_offset is not None:
+ exth.write(pack(b'>III', EXTH_CODES['coveroffset'], 12,
+ cover_offset))
+ exth.write(pack(b'>III', EXTH_CODES['hasfakecover'], 12, 0))
+ nrecs += 2
+ if thumbnail_offset is not None:
+ exth.write(pack(b'>III', EXTH_CODES['thumboffset'], 12,
+ thumbnail_offset))
+ nrecs += 1
+
+ if start_offset is not None:
+ try:
+ len(start_offset)
+ except TypeError:
+ start_offset = [start_offset]
+ for so in start_offset:
+ if so is not None:
+ exth.write(pack(b'>III', EXTH_CODES['startreading'], 12,
+ so))
+ nrecs += 1
+
+ if kf8_header_index is not None:
+ exth.write(pack(b'>III', EXTH_CODES['kf8_header_index'], 12,
+ kf8_header_index))
+ nrecs += 1
+
+ if num_of_resources is not None:
+ exth.write(pack(b'>III', EXTH_CODES['num_of_resources'], 12,
+ num_of_resources))
+ nrecs += 1
+
+ if kf8_unknown_count is not None:
+ exth.write(pack(b'>III', EXTH_CODES['kf8_unknown_count'], 12,
+ kf8_unknown_count))
+ nrecs += 1
+
+ exth = exth.getvalue()
+ trail = len(exth) % 4
+ pad = b'\0' * (4 - trail) # Always pad w/ at least 1 byte
+ exth = [b'EXTH', pack(b'>II', len(exth) + 12, nrecs), exth, pad]
+ return b''.join(exth)
+
+
diff --git a/src/calibre/ebooks/mobi/writer8/header.py b/src/calibre/ebooks/mobi/writer8/header.py
new file mode 100644
index 0000000000..94ae722f59
--- /dev/null
+++ b/src/calibre/ebooks/mobi/writer8/header.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+import random
+from io import BytesIO
+from collections import OrderedDict
+from struct import pack
+
+from calibre.ebooks.mobi.utils import align_block
+
+NULL = 0xffffffff
+zeroes = lambda x: b'\0'*x
+nulls = lambda x: b'\xff'*x
+short = lambda x: pack(b'>H', x)
+
+class Header(OrderedDict):
+
+ HEADER_NAME = b''
+
+ DEFINITION = '''
+ '''
+
+ ALIGN_BLOCK = False
+ POSITIONS = {} # Mapping of position field to field whose position should
+ # be stored in the position field
+ SHORT_FIELDS = set()
+
+ def __init__(self):
+ OrderedDict.__init__(self)
+
+ for line in self.DEFINITION.splitlines():
+ line = line.strip()
+ if not line or line.startswith('#'): continue
+ name, val = [x.strip() for x in line.partition('=')[0::2]]
+ if val:
+ val = eval(val, {'zeroes':zeroes, 'NULL':NULL, 'DYN':None,
+ 'nulls':nulls, 'short':short, 'random':random})
+ else:
+ val = 0
+ if name in self:
+ raise ValueError('Duplicate field in definition: %r'%name)
+ self[name] = val
+
+ @property
+ def dynamic_fields(self):
+ return tuple(k for k, v in self.iteritems() if v is None)
+
+ def __call__(self, **kwargs):
+ positions = {}
+ for name, val in kwargs.iteritems():
+ if name not in self:
+ raise KeyError('Not a valid header field: %r'%name)
+ self[name] = val
+
+ buf = BytesIO()
+ buf.write(bytes(self.HEADER_NAME))
+ for name, val in self.iteritems():
+ val = self.format_value(name, val)
+ positions[name] = buf.tell()
+ if val is None:
+ raise ValueError('Dynamic field %r not set'%name)
+ if isinstance(val, (int, long)):
+ fmt = 'H' if name in self.SHORT_FIELDS else 'I'
+ val = pack(b'>'+fmt, val)
+ buf.write(val)
+
+ for pos_field, field in self.POSITIONS.iteritems():
+ buf.seek(positions[pos_field])
+ buf.write(pack(b'>I', positions[field]))
+
+ ans = buf.getvalue()
+ if self.ALIGN_BLOCK:
+ ans = align_block(ans)
+ return ans
+
+
+ def format_value(self, name, val):
+ return val
+
+
diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py
new file mode 100644
index 0000000000..c37afb81ff
--- /dev/null
+++ b/src/calibre/ebooks/mobi/writer8/index.py
@@ -0,0 +1,335 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+from future_builtins import map
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+from collections import namedtuple
+from struct import pack
+from io import BytesIO
+
+from calibre.ebooks.mobi.utils import CNCX, encint, align_block
+from calibre.ebooks.mobi.writer8.header import Header
+
+TagMeta_ = namedtuple('TagMeta',
+ 'name number values_per_entry bitmask end_flag')
+TagMeta = lambda x:TagMeta_(*x)
+EndTagTable = TagMeta(('eof', 0, 0, 0, 1))
+
+# map of mask to number of shifts needed, works with 1 bit and two-bit wide masks
+# could also be extended to 4 bit wide ones as well
+mask_to_bit_shifts = { 1:0, 2:1, 3:0, 4:2, 8:3, 12:2, 16:4, 32:5, 48:4, 64:6,
+ 128:7, 192: 6 }
+
+class IndexHeader(Header): # {{{
+
+ HEADER_NAME = b'INDX'
+ ALIGN_BLOCK = True
+ HEADER_LENGTH = 192
+
+ DEFINITION = '''
+ # 4 - 8: Header Length
+ header_length = {header_length}
+
+ # 8 - 16: Unknown
+ unknown1 = zeroes(8)
+
+ # 16 - 20: Index type: 0 - normal 2 - inflection
+ type = 2
+
+ # 20 - 24: IDXT offset (filled in later)
+ idxt_offset
+
+ # 24 - 28: Number of index records
+ num_of_records = 1
+
+ # 28 - 32: Index encoding (65001 = utf-8)
+ encoding = 65001
+
+ # 32 - 36: Unknown
+ unknown2 = NULL
+
+ # 36 - 40: Number of Index entries
+ num_of_entries = DYN
+
+ # 40 - 44: ORDT offset
+ ordt_offset
+
+ # 44 - 48: LIGT offset
+ ligt_offset
+
+ # 48 - 52: Number of ORDT/LIGT? entries
+ num_of_ordt_entries
+
+ # 52 - 56: Number of CNCX records
+ num_of_cncx = DYN
+
+ # 56 - 180: Unknown
+ unknown3 = zeroes(124)
+
+ # 180 - 184: TAGX offset
+ tagx_offset = {header_length}
+
+ # 184 - 192: Unknown
+ unknown4 = zeroes(8)
+
+ # TAGX
+ tagx = DYN
+
+ # Last Index entry
+ last_index = DYN
+
+ # IDXT
+ idxt = DYN
+ '''.format(header_length=HEADER_LENGTH)
+
+ POSITIONS = {'idxt_offset':'idxt'}
+# }}}
+
+class Index(object): # {{{
+
+ control_byte_count = 1
+ cncx = CNCX()
+ tag_types = (EndTagTable,)
+
+ HEADER_LENGTH = IndexHeader.HEADER_LENGTH
+
+ @classmethod
+ def generate_tagx(cls):
+ header = b'TAGX'
+ byts = bytearray()
+ for tag_meta in cls.tag_types:
+ byts.extend(tag_meta[1:])
+ # table length, control byte count
+ header += pack(b'>II', 12+len(byts), cls.control_byte_count)
+ return header + bytes(byts)
+
+ @classmethod
+ def calculate_control_bytes_for_each_entry(cls, entries):
+ control_bytes = []
+ for lead_text, tags in entries:
+ cbs = []
+ ans = 0
+ for (name, number, vpe, mask, endi) in cls.tag_types:
+ if endi == 1:
+ cbs.append(ans)
+ ans = 0
+ continue
+ try:
+ nvals = len(tags.get(name, ()))
+ except TypeError:
+ nvals = 1
+ nentries = nvals // vpe
+ shifts = mask_to_bit_shifts[mask]
+ ans |= mask & (nentries << shifts)
+ if len(cbs) != cls.control_byte_count:
+ raise ValueError('The entry %r is invalid'%[lead_text, tags])
+ control_bytes.append(cbs)
+ return control_bytes
+
+ def __call__(self):
+ self.control_bytes = self.calculate_control_bytes_for_each_entry(
+ self.entries)
+
+ rendered_entries = []
+ index, idxt, buf = BytesIO(), BytesIO(), BytesIO()
+ IndexEntry = namedtuple('IndexEntry', 'offset length raw')
+ last_lead_text = b''
+ too_large = ValueError('Index has too many entries, calibre does not'
+ ' support generating multiple index records at this'
+ ' time.')
+
+ for i, x in enumerate(self.entries):
+ control_bytes = self.control_bytes[i]
+ leading_text, tags = x
+ buf.seek(0), buf.truncate(0)
+ leading_text = (leading_text.encode('utf-8') if
+ isinstance(leading_text, unicode) else leading_text)
+ raw = bytearray(leading_text)
+ raw.insert(0, len(leading_text))
+ buf.write(bytes(raw))
+ buf.write(bytes(bytearray(control_bytes)))
+ for tag in self.tag_types:
+ values = tags.get(tag.name, None)
+ if values is None: continue
+ try:
+ len(values)
+ except TypeError:
+ values = [values]
+ if values:
+ for val in values:
+ try:
+ buf.write(encint(val))
+ except ValueError:
+ raise ValueError('Invalid values for %r: %r'%(
+ tag, values))
+ raw = buf.getvalue()
+ offset = index.tell()
+ if offset + self.HEADER_LENGTH >= 0x10000:
+ raise too_large
+ rendered_entries.append(IndexEntry(offset, len(raw), raw))
+ idxt.write(pack(b'>H', self.HEADER_LENGTH+offset))
+ index.write(raw)
+ last_lead_text = leading_text
+
+ index_block = align_block(index.getvalue())
+ idxt_block = align_block(b'IDXT' + idxt.getvalue())
+ body = index_block + idxt_block
+ if len(body) + self.HEADER_LENGTH >= 0x10000:
+ raise too_large
+ header = b'INDX'
+ buf.seek(0), buf.truncate(0)
+ buf.write(pack(b'>I', self.HEADER_LENGTH))
+ buf.write(b'\0'*4) # Unknown
+ buf.write(pack(b'>I', 1)) # Header type? Or index record number?
+ buf.write(b'\0'*4) # Unknown
+
+ # IDXT block offset
+ buf.write(pack(b'>I', self.HEADER_LENGTH + len(index_block)))
+
+ # Number of index entries
+ buf.write(pack(b'>I', len(rendered_entries)))
+
+ buf.write(b'\xff'*8) # Unknown
+
+ buf.write(b'\0'*156) # Unknown
+
+ header += buf.getvalue()
+ index_record = header + body
+
+ tagx = self.generate_tagx()
+ idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) +
+ b'\0')
+ # Last index
+ idx = bytes(bytearray([len(last_lead_text)])) + last_lead_text
+ idx += pack(b'>H', len(rendered_entries))
+
+ header = {
+ 'num_of_entries': len(rendered_entries),
+ 'num_of_cncx': len(self.cncx),
+ 'tagx':tagx,
+ 'last_index':align_block(idx),
+ 'idxt':idxt
+ }
+ header = IndexHeader()(**header)
+ self.records = [header, index_record]
+ self.records.extend(self.cncx.records)
+ return self.records
+# }}}
+
+class SkelIndex(Index):
+
+ tag_types = tuple(map(TagMeta, (
+ ('chunk_count', 1, 1, 3, 0),
+ ('geometry', 6, 2, 12, 0),
+ EndTagTable
+ )))
+
+ def __init__(self, skel_table):
+ self.entries = [
+ (s.name, {
+ # Dont ask me why these entries have to be repeated twice
+ 'chunk_count':(s.chunk_count, s.chunk_count),
+ 'geometry':(s.start_pos, s.length, s.start_pos, s.length),
+ }) for s in skel_table
+ ]
+
+
+class ChunkIndex(Index):
+
+ tag_types = tuple(map(TagMeta, (
+ ('cncx_offset', 2, 1, 1, 0),
+ ('file_number', 3, 1, 2, 0),
+ ('sequence_number', 4, 1, 4, 0),
+ ('geometry', 6, 2, 8, 0),
+ EndTagTable
+ )))
+
+ def __init__(self, chunk_table):
+ self.cncx = CNCX(c.selector for c in chunk_table)
+
+ self.entries = [
+ ('%010d'%c.insert_pos, {
+
+ 'cncx_offset':self.cncx[c.selector],
+ 'file_number':c.file_number,
+ 'sequence_number':c.sequence_number,
+ 'geometry':(c.start_pos, c.length),
+ }) for c in chunk_table
+ ]
+
+class GuideIndex(Index):
+
+ tag_types = tuple(map(TagMeta, (
+ ('title', 1, 1, 1, 0),
+ ('pos_fid', 6, 2, 2, 0),
+ EndTagTable
+ )))
+
+ def __init__(self, guide_table):
+ self.cncx = CNCX(c.title for c in guide_table)
+
+ self.entries = [
+ (r.type, {
+
+ 'title':self.cncx[r.title],
+ 'pos_fid':r.pos_fid,
+ }) for r in guide_table
+ ]
+
+
+class NCXIndex(Index):
+
+ ''' The commented out parts have been seen in NCX indexes from MOBI 6
+ periodicals. Since we have no MOBI 8 periodicals to reverse engineer, leave
+ it for now. '''
+ # control_byte_count = 2
+ tag_types = tuple(map(TagMeta, (
+ ('offset', 1, 1, 1, 0),
+ ('length', 2, 1, 2, 0),
+ ('label', 3, 1, 4, 0),
+ ('depth', 4, 1, 8, 0),
+ ('parent', 21, 1, 16, 0),
+ ('first_child', 22, 1, 32, 0),
+ ('last_child', 23, 1, 64, 0),
+ ('pos_fid', 6, 2, 128, 0),
+ EndTagTable,
+ # ('image', 69, 1, 1, 0),
+ # ('description', 70, 1, 2, 0),
+ # ('author', 71, 1, 4, 0),
+ # ('caption', 72, 1, 8, 0),
+ # ('attribution', 73, 1, 16, 0),
+ # EndTagTable
+ )))
+
+ def __init__(self, toc_table):
+ strings = []
+ for entry in toc_table:
+ strings.append(entry['label'])
+ aut = entry.get('author', None)
+ if aut:
+ strings.append(aut)
+ desc = entry.get('description', None)
+ if desc:
+ strings.append(desc)
+ self.cncx = CNCX(strings)
+
+ def to_entry(x):
+ ans = {}
+ for f in ('offset', 'length', 'depth', 'pos_fid', 'parent',
+ 'first_child', 'last_child'):
+ if f in x:
+ ans[f] = x[f]
+ for f in ('label', 'description', 'author'):
+ if f in x:
+ ans[f] = self.cncx[x[f]]
+ return ('%02x'%x['index'], ans)
+
+ self.entries = list(map(to_entry, toc_table))
+
+
+
diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py
index fc4234eb10..97ed31a2e3 100644
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -7,9 +7,400 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal '
__docformat__ = 'restructuredtext en'
+import copy, logging
+from functools import partial
+from collections import defaultdict, namedtuple
+from io import BytesIO
+from struct import pack
+
+import cssutils
+from lxml import etree
+
+from calibre import isbytestring, force_unicode
+from calibre.ebooks.mobi.utils import (create_text_record, to_base,
+ is_guide_ref_start)
+from calibre.ebooks.compression.palmdoc import compress_doc
+from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
+ extract, XHTML, urlnormalize)
+from calibre.ebooks.oeb.parse_utils import barename
+from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
+from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
+ ChunkIndex, GuideIndex)
+from calibre.ebooks.mobi.writer8.mobi import KF8Book
+from calibre.ebooks.mobi.writer8.tbs import apply_trailing_byte_sequences
+from calibre.ebooks.mobi.writer8.toc import TOCAdder
+
+XML_DOCS = OEB_DOCS | {SVG_MIME}
+
+# References to record numbers in KF8 are stored as base-32 encoded integers,
+# with 4 digits
+to_ref = partial(to_base, base=32, min_num_digits=4)
class KF8Writer(object):
- def __init__(self, oeb, opts):
+ def __init__(self, oeb, opts, resources):
self.oeb, self.opts, self.log = oeb, opts, oeb.log
+ self.compress = not self.opts.dont_compress
+ self.has_tbs = False
+ self.log.info('Creating KF8 output')
+
+ # Create an inline ToC if one does not already exist
+ self.toc_adder = TOCAdder(oeb, opts)
+ self.used_images = set()
+ self.resources = resources
+ self.flows = [None] # First flow item is reserved for the text
+ self.records = [None] # Placeholder for zeroth record
+
+ self.log('\tGenerating KF8 markup...')
+ self.dup_data()
+ self.replace_resource_links()
+ self.extract_css_into_flows()
+ self.extract_svg_into_flows()
+ self.replace_internal_links_with_placeholders()
+ self.insert_aid_attributes()
+ self.chunk_it_up()
+ # Dump the cloned data as it is no longer needed
+ del self._data_cache
+ self.create_text_records()
+ self.log('\tCreating indices...')
+ self.create_fdst_records()
+ self.create_indices()
+ self.create_guide()
+ # We do not want to use this ToC for MOBI 6, so remove it
+ self.toc_adder.remove_generated_toc()
+
+ def dup_data(self):
+ ''' Duplicate data so that any changes we make to markup/CSS only
+ affect KF8 output and not MOBI 6 output '''
+ self._data_cache = {}
+ # Suppress cssutils logging output as it is duplicated anyway earlier
+ # in the pipeline
+ cssutils.log.setLevel(logging.CRITICAL)
+ for item in self.oeb.manifest:
+ if item.media_type in XML_DOCS:
+ self._data_cache[item.href] = copy.deepcopy(item.data)
+ elif item.media_type in OEB_STYLES:
+ # I can't figure out how to make an efficient copy of the
+ # in-memory CSSStylesheet, as deepcopy doesn't work (raises an
+ # exception)
+ self._data_cache[item.href] = cssutils.parseString(
+ item.data.cssText, validate=False)
+
+ def data(self, item):
+ return self._data_cache.get(item.href, item.data)
+
+ def replace_resource_links(self):
+ ''' Replace links to resources (raster images/fonts) with pointers to
+ the MOBI record containing the resource. The pointers are of the form:
+ kindle:embed:XXXX?mime=image/* The ?mime= is apparently optional and
+ not used for fonts. '''
+
+ def pointer(item, oref):
+ ref = item.abshref(oref)
+ idx = self.resources.item_map.get(ref, None)
+ if idx is not None:
+ is_image = self.resources.records[idx-1][:4] not in {b'FONT'}
+ idx = to_ref(idx)
+ if is_image:
+ self.used_images.add(ref)
+ return 'kindle:embed:%s?mime=%s'%(idx,
+ self.resources.mime_map[ref])
+ else:
+ return 'kindle:embed:%s'%idx
+ return oref
+
+ for item in self.oeb.manifest:
+
+ if item.media_type in XML_DOCS:
+ root = self.data(item)
+ for tag in XPath('//h:img|//svg:image')(root):
+ for attr, ref in tag.attrib.iteritems():
+ if attr.split('}')[-1].lower() in {'src', 'href'}:
+ tag.attrib[attr] = pointer(item, ref)
+
+ for tag in XPath('//h:style')(root):
+ if tag.text:
+ sheet = cssutils.parseString(tag.text, validate=False)
+ replacer = partial(pointer, item)
+ cssutils.replaceUrls(sheet, replacer,
+ ignoreImportRules=True)
+ repl = sheet.cssText
+ if isbytestring(repl):
+ repl = repl.decode('utf-8')
+ tag.text = '\n'+ repl + '\n'
+
+ elif item.media_type in OEB_STYLES:
+ sheet = self.data(item)
+ replacer = partial(pointer, item)
+ cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)
+
+ def extract_css_into_flows(self):
+ inlines = defaultdict(list) # Ensure identical
+
+
+{title}
+
+
+