diff --git a/recipes/idg_se.recipe b/recipes/idg_se.recipe index e5f0203e09..155c6647d3 100644 --- a/recipes/idg_se.recipe +++ b/recipes/idg_se.recipe @@ -4,7 +4,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class IDGse(BasicNewsRecipe): title = 'IDG' - __author__ = 'zapt0' + __author__ = 'Stanislav Khromov' language = 'sv' description = 'IDG.se' oldest_article = 1 @@ -15,6 +15,9 @@ class IDGse(BasicNewsRecipe): feeds = [(u'Dagens IDG-nyheter',u'http://feeds.idg.se/idg/ETkj?format=xml')] + def get_article_url(self, article): + return article.get('guid', None) + def print_version(self,url): return url + '?articleRenderMode=print&m=print' diff --git a/recipes/microwave_and_rf.recipe b/recipes/microwave_and_rf.recipe index e3eee9dab1..3cdf6e5acc 100644 --- a/recipes/microwave_and_rf.recipe +++ b/recipes/microwave_and_rf.recipe @@ -15,7 +15,7 @@ import re from calibre.web.feeds.news import BasicNewsRecipe from calibre.utils.magick import Image -class Microwave_and_RF(BasicNewsRecipe): +class Microwaves_and_RF(BasicNewsRecipe): Convert_Grayscale = False # Convert images to gray scale or not @@ -25,9 +25,9 @@ class Microwave_and_RF(BasicNewsRecipe): # Add sections that want to be included from the magazine include_sections = [] - title = u'Microwave and RF' - __author__ = 'kiavash' - description = u'Microwave and RF Montly Magazine' + title = u'Microwaves and RF' + __author__ = u'kiavash' + description = u'Microwaves and RF Montly Magazine' publisher = 'Penton Media, Inc.' publication_type = 'magazine' site = 'http://mwrf.com' @@ -96,9 +96,16 @@ class Microwave_and_RF(BasicNewsRecipe): def parse_index(self): - # Fetches the main page of Microwave and RF + # Fetches the main page of Microwaves and RF soup = self.index_to_soup(self.site) + # First page has the ad, Let's find the redirect address. 
+ url = soup.find('span', attrs={'class':'commonCopy'}).find('a').get('href') + if url.startswith('/'): + url = self.site + url + + soup = self.index_to_soup(url) + # Searches the site for Issue ID link then returns the href address # pointing to the latest issue latest_issue = soup.find('a', attrs={'href':lambda x: x and 'IssueID' in x}).get('href') diff --git a/recipes/satmagazine.recipe b/recipes/satmagazine.recipe new file mode 100644 index 0000000000..3e4b1e1b19 --- /dev/null +++ b/recipes/satmagazine.recipe @@ -0,0 +1,155 @@ +#!/usr/bin/env python +## +## Title: SatMagazine +## +## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html +## +## Written: Feb 2012 +## Last Edited: Mar 2012 +## + +# Feb 2012: Initial release + +__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html' + +''' +satmagazine.com +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class SatMagazine(BasicNewsRecipe): + + title = u'SatMagazine' + description = u'North American Satellite Markets...' + publisher = 'Satnews Publishers' + publication_type = 'magazine' + INDEX = 'http://www.satmagazine.com/cgi-bin/display_edition.cgi' + __author__ = 'kiavash' + + language = 'en' + asciiize = True + timeout = 120 + simultaneous_downloads = 2 + + # Flattens all the tables to make it compatible with Nook + conversion_options = {'linearize_tables' : True} + + keep_only_tags = [dict(name='span', attrs={'class':'story'})] + + no_stylesheets = True + remove_javascript = True + + remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan', + 'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ] + + # Specify extra CSS - overrides ALL other CSS (IE. Added last). 
+ extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \ + .introduction, .first { font-weight: bold; } \ + .cross-head { font-weight: bold; font-size: 125%; } \ + .cap, .caption { display: block; font-size: 80%; font-style: italic; } \ + .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \ + .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \ + .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \ + font-size: 80%; font-style: italic; margin: 1px auto; } \ + .story-date, .published { font-size: 80%; } \ + table { width: 100%; } \ + td img { display: block; margin: 5px auto; } \ + ul { padding-top: 10px; } \ + ol { padding-top: 10px; } \ + li { padding-top: 5px; padding-bottom: 5px; } \ + h1 { font-size: 175%; font-weight: bold; } \ + h2 { font-size: 150%; font-weight: bold; } \ + h3 { font-size: 125%; font-weight: bold; } \ + h4, h5, h6 { font-size: 100%; font-weight: bold; }' + + # Remove the line breaks, href links and float left/right and picture width/height. 
+ preprocess_regexps = [(re.compile(r'', re.IGNORECASE), lambda m: ''), + (re.compile(r'', re.IGNORECASE), lambda m: ''), + (re.compile(r''), lambda h1: ''), + (re.compile(r''), lambda h2: ''), + (re.compile(r'float:.*?'), lambda h3: ''), + (re.compile(r'width:.*?px'), lambda h4: ''), + (re.compile(r'height:.*?px'), lambda h5: '') + ] + + def parse_index(self): + + article_info = [] + feeds = [] + + soup = self.index_to_soup(self.INDEX) + + # Find Cover image + cover = soup.find('img', src=True, alt='Cover Image') + if cover is not None: + self.cover_url = cover['src'] + self.log('Found Cover image:', self.cover_url) + + soup = soup.find('div', attrs={'id':'middlecontent'}) # main part of the site that has the articles + + #Find the Magazine date + ts = soup.find('span', attrs={'class':'master_heading'}) # contains the string with the date + ds = ' '.join(self.tag_to_string(ts).strip().split()[:2]) + self.log('Found Current Issue:', ds) + self.timefmt = ' [%s]'%ds + + #sections = soup.findAll('span', attrs={'class':'upper_heading'}) + + articles = soup.findAll('span', attrs={'class':'heading'}) + + descriptions = soup.findAll('span', attrs={'class':'story'}) + + title_number = 0 + + # Goes thru all the articles one by one and sort them out + for article in articles: + + title = self.tag_to_string(article) + url = article.find('a').get('href') + + self.log('\tFound article:', title, 'at', url) + desc = self.tag_to_string(descriptions[title_number]) + #self.log('\t\t', desc) + + article_info.append({'title':title, 'url':url, 'description':desc, + 'date':self.timefmt}) + + title_number = title_number + 1 + + if article_info: + feeds.append((self.title, article_info)) + + return feeds + + def preprocess_html(self, soup): + + # Finds all the images + for figure in soup.findAll('img', attrs = {'src' : True}): + + # if the image is an ad then remove it. 
+ if (figure['alt'].find('_ad_') >=0) or (figure['alt'].find('_snipe_') >=0): + del figure['src'] + del figure['alt'] + del figure['border'] + del figure['hspace'] + del figure['vspace'] + del figure['align'] + del figure['size'] + figure.name = 'font' + continue + + figure['style'] = 'display:block' # adds /n before and after the image + + # Makes the title standing out + for title in soup.findAll('b'): + title.name = 'h3' + + # Removes all unrelated links + for link in soup.findAll('a', attrs = {'href': True}): + link.name = 'font' + del link['href'] + del link['target'] + + return soup diff --git a/recipes/sueddeutsche.recipe b/recipes/sueddeutsche.recipe index 4e683ef0a9..624321e730 100644 --- a/recipes/sueddeutsche.recipe +++ b/recipes/sueddeutsche.recipe @@ -11,7 +11,7 @@ class Sueddeutsche(BasicNewsRecipe): title = u'Süddeutsche.de' # 2012-01-26 AGe Correct Title description = 'News from Germany, Access to online content' # 2012-01-26 AGe __author__ = 'Oliver Niesner and Armin Geller' #Update AGe 2012-01-26 - publisher = 'Süddeutsche Zeitung' # 2012-01-26 AGe add + publisher = u'Süddeutsche Zeitung' # 2012-01-26 AGe add category = 'news, politics, Germany' # 2012-01-26 AGe add timefmt = ' [%a, %d %b %Y]' # 2012-01-26 AGe add %a oldest_article = 7 diff --git a/recipes/sueddeutschezeitung.recipe b/recipes/sueddeutschezeitung.recipe index 3185fc0f8e..f38f80dd45 100644 --- a/recipes/sueddeutschezeitung.recipe +++ b/recipes/sueddeutschezeitung.recipe @@ -9,10 +9,10 @@ from calibre.web.feeds.news import BasicNewsRecipe from calibre import strftime class SueddeutcheZeitung(BasicNewsRecipe): - title = 'Süddeutsche Zeitung' + title = u'Süddeutsche Zeitung' __author__ = 'Darko Miletic' description = 'News from Germany. Access to paid content.' 
- publisher = 'Süddeutsche Zeitung' + publisher = u'Süddeutsche Zeitung' category = 'news, politics, Germany' no_stylesheets = True oldest_article = 2 diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index a15cb62aaf..7f258afdc9 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -502,3 +502,13 @@ tweak_book_prefer = 'epub' # negative number to increase or decrease the font size. change_book_details_font_size_by = 0 +#: Compile General Program Mode templates to Python +# Compiled general program mode templates are significantly faster than +# interpreted templates. Setting this tweak to True causes calibre to compile +# (in most cases) general program mode templates. Setting it to False causes +# calibre to use the old behavior -- interpreting the templates. Set the tweak +# to False if some compiled templates produce incorrect values. +# Default: compile_gpm_templates = True +# No compile: compile_gpm_templates = False +compile_gpm_templates = True + diff --git a/resources/images/lt.png b/resources/images/lt.png index c29efb9f88..d19222d93f 100644 Binary files a/resources/images/lt.png and b/resources/images/lt.png differ diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index 0062137247..6ef1e528fe 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -81,7 +81,7 @@ class ANDROID(USBMS): 0x4e11 : [0x0100, 0x226, 0x227], 0x4e12 : [0x0100, 0x226, 0x227], 0x4e21 : [0x0100, 0x226, 0x227, 0x231], - 0x4e22 : [0x0100, 0x226, 0x227], + 0x4e22 : [0x0100, 0x226, 0x227, 0x231], 0xb058 : [0x0222, 0x226, 0x227], 0x0ff9 : [0x0226], 0xdddd : [0x216], @@ -194,7 +194,8 @@ class ANDROID(USBMS): '__UMS_COMPOSITE', 'SGH-I997_CARD', 'MB870', 'ALPANDIGITAL', 'ANDROID_MID', 'P990_SD_CARD', '.K080', 'LTE_CARD', 'MB853', 'A1-07___C0541A4F', 'XT912', 'MB855', 'XT910', 'BOOK_A10_CARD', - 'USB_2.0_DRIVER', 'I9100T', 'P999DW_SD_CARD', 'KTABLET_PC'] + 
'USB_2.0_DRIVER', 'I9100T', 'P999DW_SD_CARD', 'KTABLET_PC', + 'FILE-CD_GADGET'] OSX_MAIN_MEM = 'Android Device Main Memory' diff --git a/src/calibre/ebooks/conversion/plugins/mobi_input.py b/src/calibre/ebooks/conversion/plugins/mobi_input.py index f56eb2002c..9d71b69891 100644 --- a/src/calibre/ebooks/conversion/plugins/mobi_input.py +++ b/src/calibre/ebooks/conversion/plugins/mobi_input.py @@ -7,6 +7,22 @@ import os from calibre.customize.conversion import InputFormatPlugin +def run_mobi_unpack(stream, options, log, accelerators): + from mobiunpack.mobi_unpack import Mobi8Reader + from calibre.customize.ui import plugin_for_input_format + from calibre.ptempfile import PersistentTemporaryDirectory + + wdir = PersistentTemporaryDirectory('_unpack_space') + m8r = Mobi8Reader(stream, wdir) + if m8r.isK8(): + epub_path = m8r.processMobi8() + epub_input = plugin_for_input_format('epub') + for opt in epub_input.options: + setattr(options, opt.option.name, opt.recommended_value) + options.input_encoding = m8r.getCodec() + return epub_input.convert(open(epub_path,'rb'), options, + 'epub', log, accelerators) + class MOBIInput(InputFormatPlugin): name = 'MOBI Input' @@ -18,23 +34,12 @@ class MOBIInput(InputFormatPlugin): accelerators): if os.environ.get('USE_MOBIUNPACK', None) is not None: - from calibre.ptempfile import PersistentTemporaryDirectory + pos = stream.tell() try: - from mobiunpack.mobi_unpack import Mobi8Reader - from calibre.customize.ui import plugin_for_input_format - - wdir = PersistentTemporaryDirectory('_unpack_space') - m8r = Mobi8Reader(stream, wdir) - if m8r.isK8(): - epub_path = m8r.processMobi8() - epub_input = plugin_for_input_format('epub') - for opt in epub_input.options: - setattr(options, opt.option.name, opt.recommended_value) - options.input_encoding = m8r.getCodec() - return epub_input.convert(open(epub_path,'rb'), options, - 'epub', log, accelerators) + return run_mobi_unpack(stream, options, log, accelerators) except Exception: 
log.exception('mobi_unpack code not working') + stream.seek(pos) from calibre.ebooks.mobi.reader.mobi6 import MobiReader from lxml import html @@ -52,7 +57,7 @@ class MOBIInput(InputFormatPlugin): mr.extract_content(u'.', parse_cache) if mr.kf8_type is not None: - log('Found KF8 MOBI') + log('Found KF8 MOBI of type %r'%mr.kf8_type) from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader return os.path.abspath(Mobi8Reader(mr, log)()) diff --git a/src/calibre/ebooks/conversion/plugins/mobi_output.py b/src/calibre/ebooks/conversion/plugins/mobi_output.py index f22015d71f..7288f095d7 100644 --- a/src/calibre/ebooks/conversion/plugins/mobi_output.py +++ b/src/calibre/ebooks/conversion/plugins/mobi_output.py @@ -18,9 +18,6 @@ class MOBIOutput(OutputFormatPlugin): file_type = 'mobi' options = set([ - OptionRecommendation(name='rescale_images', recommended_value=False, - help=_('Modify images to meet Palm device size limitations.') - ), OptionRecommendation(name='prefer_author_sort', recommended_value=False, level=OptionRecommendation.LOW, help=_('When present, use author sort field as author.') @@ -59,7 +56,16 @@ class MOBIOutput(OutputFormatPlugin): help=_('Enable sharing of book content via Facebook etc. ' ' on the Kindle. WARNING: Using this feature means that ' ' the book will not auto sync its last read position ' - ' on multiple devices. Complain to Amazon.')) + ' on multiple devices. Complain to Amazon.') + ), + OptionRecommendation(name='mobi_keep_original_images', + recommended_value=False, + help=_('By default calibre converts all images to JPEG format ' + 'in the output MOBI file. This is for maximum compatibility ' + 'as some older MOBI viewers have problems with other image ' + 'formats. This option tells calibre not to do this. 
' + 'Useful if your document contains lots of GIF/PNG images that ' + 'become very large when converted to JPEG.')), ]) def check_for_periodical(self): @@ -167,12 +173,7 @@ class MOBIOutput(OutputFormatPlugin): mobimlizer(oeb, opts) self.check_for_periodical() write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz') - from calibre.utils.config import tweaks - if tweaks.get('new_mobi_writer', True): - from calibre.ebooks.mobi.writer2.main import MobiWriter - MobiWriter - else: - from calibre.ebooks.mobi.writer import MobiWriter + from calibre.ebooks.mobi.writer2.main import MobiWriter writer = MobiWriter(opts, write_page_breaks_after_item=write_page_breaks_after_item) writer(oeb, output_path) diff --git a/src/calibre/ebooks/metadata/book/base.py b/src/calibre/ebooks/metadata/book/base.py index 1b43b97b73..ce80486af8 100644 --- a/src/calibre/ebooks/metadata/book/base.py +++ b/src/calibre/ebooks/metadata/book/base.py @@ -9,16 +9,19 @@ import copy, traceback from calibre import prints from calibre.constants import DEBUG -from calibre.ebooks.metadata.book import SC_COPYABLE_FIELDS -from calibre.ebooks.metadata.book import SC_FIELDS_COPY_NOT_NULL -from calibre.ebooks.metadata.book import STANDARD_METADATA_FIELDS -from calibre.ebooks.metadata.book import TOP_LEVEL_IDENTIFIERS -from calibre.ebooks.metadata.book import ALL_METADATA_FIELDS +from calibre.ebooks.metadata.book import (SC_COPYABLE_FIELDS, + SC_FIELDS_COPY_NOT_NULL, STANDARD_METADATA_FIELDS, + TOP_LEVEL_IDENTIFIERS, ALL_METADATA_FIELDS) from calibre.library.field_metadata import FieldMetadata from calibre.utils.date import isoformat, format_date from calibre.utils.icu import sort_key from calibre.utils.formatter import TemplateFormatter +# Special sets used to optimize the performance of getting and setting +# attributes on Metadata objects +SIMPLE_GET = frozenset(STANDARD_METADATA_FIELDS - TOP_LEVEL_IDENTIFIERS) +SIMPLE_SET = frozenset(SIMPLE_GET - {'identifiers'}) + def 
human_readable(size, precision=2): """ Convert a size in bytes into megabytes """ return ('%.'+str(precision)+'f'+ 'MB') % ((size/(1024.*1024.)),) @@ -136,6 +139,8 @@ class Metadata(object): def __getattribute__(self, field): _data = object.__getattribute__(self, '_data') + if field in SIMPLE_GET: + return _data.get(field, None) if field in TOP_LEVEL_IDENTIFIERS: return _data.get('identifiers').get(field, None) if field == 'language': @@ -143,8 +148,6 @@ class Metadata(object): return _data.get('languages', [])[0] except: return NULL_VALUES['language'] - if field in STANDARD_METADATA_FIELDS: - return _data.get(field, None) try: return object.__getattribute__(self, field) except AttributeError: @@ -173,7 +176,11 @@ class Metadata(object): def __setattr__(self, field, val, extra=None): _data = object.__getattribute__(self, '_data') - if field in TOP_LEVEL_IDENTIFIERS: + if field in SIMPLE_SET: + if val is None: + val = copy.copy(NULL_VALUES.get(field, None)) + _data[field] = val + elif field in TOP_LEVEL_IDENTIFIERS: field, val = self._clean_identifier(field, val) identifiers = _data['identifiers'] identifiers.pop(field, None) @@ -188,10 +195,6 @@ class Metadata(object): if val and val.lower() != 'und': langs = [val] _data['languages'] = langs - elif field in STANDARD_METADATA_FIELDS: - if val is None: - val = copy.copy(NULL_VALUES.get(field, None)) - _data[field] = val elif field in _data['user_metadata'].iterkeys(): _data['user_metadata'][field]['#value#'] = val _data['user_metadata'][field]['#extra#'] = extra @@ -404,9 +407,19 @@ class Metadata(object): ''' if metadata is None: traceback.print_stack() - else: - for key in metadata: - self.set_user_metadata(key, metadata[key]) + return + + um = {} + for key, meta in metadata.iteritems(): + m = meta.copy() + if '#value#' not in m: + if m['datatype'] == 'text' and m['is_multiple']: + m['#value#'] = [] + else: + m['#value#'] = None + um[key] = m + _data = object.__getattribute__(self, '_data') + 
_data['user_metadata'].update(um) def set_user_metadata(self, field, metadata): ''' @@ -420,9 +433,11 @@ class Metadata(object): if metadata is None: traceback.print_stack() return - m = {} - for k in metadata: - m[k] = copy.copy(metadata[k]) + m = dict(metadata) + # Copying the elements should not be necessary. The objects referenced + # in the dict should not change. Of course, they can be replaced. + # for k,v in metadata.iteritems(): + # m[k] = copy.copy(v) if '#value#' not in m: if m['datatype'] == 'text' and m['is_multiple']: m['#value#'] = [] @@ -543,6 +558,7 @@ class Metadata(object): # Happens if x is not a text, is_multiple field # on self lstags = [] + self_tags = [] ot, st = map(frozenset, (lotags, lstags)) for t in st.intersection(ot): sidx = lstags.index(t) diff --git a/src/calibre/ebooks/metadata/mobi.py b/src/calibre/ebooks/metadata/mobi.py index 911421a6ce..846015f491 100644 --- a/src/calibre/ebooks/metadata/mobi.py +++ b/src/calibre/ebooks/metadata/mobi.py @@ -9,16 +9,21 @@ __copyright__ = '2009, Kovid Goyal kovid@kovidgoyal.net and ' \ 'Marshall T. 
Vandegrift ' __docformat__ = 'restructuredtext en' -import os, cStringIO +import os, cStringIO, imghdr from struct import pack, unpack from cStringIO import StringIO from calibre.ebooks import normalize -from calibre.ebooks.mobi import MobiError -from calibre.ebooks.mobi.writer import rescale_image, MAX_THUMB_DIMEN +from calibre.ebooks.mobi import MobiError, MAX_THUMB_DIMEN +from calibre.ebooks.mobi.utils import rescale_image from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.utils.date import now as nowf +def is_image(ss): + if ss is None: + return False + return imghdr.what(None, ss[:200]) is not None + class StreamSlicer(object): def __init__(self, stream, start=0, stop=None): @@ -161,11 +166,10 @@ class MetadataUpdater(object): if id == 106: self.timestamp = content elif id == 201: - rindex, = self.cover_rindex, = unpack('>i', content) - if rindex > 0 : - self.cover_record = self.record(rindex + image_base) + rindex, = self.cover_rindex, = unpack('>I', content) + self.cover_record = self.record(rindex + image_base) elif id == 202: - rindex, = self.thumbnail_rindex, = unpack('>i', content) + rindex, = self.thumbnail_rindex, = unpack('>I', content) if rindex > 0 : self.thumbnail_record = self.record(rindex + image_base) @@ -416,17 +420,17 @@ class MetadataUpdater(object): except: pass else: - if self.cover_record is not None: + if is_image(self.cover_record): size = len(self.cover_record) cover = rescale_image(data, size) if len(cover) <= size: - cover += '\0' * (size - len(cover)) + cover += b'\0' * (size - len(cover)) self.cover_record[:] = cover - if self.thumbnail_record is not None: + if is_image(self.thumbnail_record): size = len(self.thumbnail_record) thumbnail = rescale_image(data, size, dimen=MAX_THUMB_DIMEN) if len(thumbnail) <= size: - thumbnail += '\0' * (size - len(thumbnail)) + thumbnail += b'\0' * (size - len(thumbnail)) self.thumbnail_record[:] = thumbnail return diff --git a/src/calibre/ebooks/metadata/opf2.py 
b/src/calibre/ebooks/metadata/opf2.py index 91b6b571ec..c30545e6e1 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -1148,7 +1148,8 @@ class OPFCreator(Metadata): self.manifest = Manifest.from_paths(entries) self.manifest.set_basedir(self.base_path) - def create_manifest_from_files_in(self, files_and_dirs): + def create_manifest_from_files_in(self, files_and_dirs, + exclude=lambda x:False): entries = [] def dodir(dir): @@ -1156,7 +1157,7 @@ class OPFCreator(Metadata): root, files = spec[0], spec[-1] for name in files: path = os.path.join(root, name) - if os.path.isfile(path): + if os.path.isfile(path) and not exclude(path): entries.append((path, None)) for i in files_and_dirs: diff --git a/src/calibre/ebooks/mobi/__init__.py b/src/calibre/ebooks/mobi/__init__.py index 55bc030796..22e0c1388f 100644 --- a/src/calibre/ebooks/mobi/__init__.py +++ b/src/calibre/ebooks/mobi/__init__.py @@ -6,3 +6,8 @@ __copyright__ = '2008, Kovid Goyal ' class MobiError(Exception): pass + +MAX_THUMB_SIZE = 16 * 1024 +MAX_THUMB_DIMEN = (180, 240) + + diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 7f2695b5c4..b03448a63b 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -14,8 +14,9 @@ from lxml import html from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language +from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.utils import (decode_hex_number, decint, - get_trailing_data, decode_tbs) + get_trailing_data, decode_tbs, read_font_record) from calibre.utils.magick.draw import identify_data def format_bytes(byts): @@ -151,6 +152,10 @@ class EXTHRecord(object): 117 : 'adult', 118 : 'retailprice', 119 : 'retailpricecurrency', + 121 : 'KF8 header section index', + 125 : 'KF8 resources (images/fonts) count', + 129 : 'KF8 cover URI', + 131 : 'KF8 unknown count', 201 : 'coveroffset', 202 : 
'thumboffset', 203 : 'hasfakecover', @@ -169,9 +174,10 @@ class EXTHRecord(object): 503 : 'updatedtitle', }.get(self.type, repr(self.type)) - if self.name in ('coveroffset', 'thumboffset', 'hasfakecover', + if (self.name in {'coveroffset', 'thumboffset', 'hasfakecover', 'Creator Major Version', 'Creator Minor Version', - 'Creator Build Number', 'Creator Software', 'startreading'): + 'Creator Build Number', 'Creator Software', 'startreading'} or + self.type in {121, 125, 131}): self.data, = struct.unpack(b'>I', self.data) def __str__(self): @@ -338,9 +344,9 @@ class MOBIHeader(object): # {{{ ans.append('File version: %d'%self.file_version) ans.append('Reserved: %r'%self.reserved) ans.append('Secondary index record: %d (null val: %d)'%( - self.secondary_index_record, 0xffffffff)) + self.secondary_index_record, NULL_INDEX)) ans.append('Reserved2: %r'%self.reserved2) - ans.append('First non-book record (null value: %d): %d'%(0xffffffff, + ans.append('First non-book record (null value: %d): %d'%(NULL_INDEX, self.first_non_book_record)) ans.append('Full name offset: %d'%self.fullname_offset) ans.append('Full name length: %d bytes'%self.fullname_length) @@ -379,7 +385,7 @@ class MOBIHeader(object): # {{{ '(has indexing: %s) (has uncrossable breaks: %s)')%( bin(self.extra_data_flags), self.has_multibytes, self.has_indexing_bytes, self.has_uncrossable_breaks )) - ans.append('Primary index record (null value: %d): %d'%(0xffffffff, + ans.append('Primary index record (null value: %d): %d'%(NULL_INDEX, self.primary_index_record)) ans = '\n'.join(ans) @@ -1149,6 +1155,25 @@ class BinaryRecord(object): # {{{ # }}} +class FontRecord(object): # {{{ + + def __init__(self, idx, record): + self.raw = record.raw + name = '%06d'%idx + self.font = read_font_record(self.raw) + if self.font['err']: + raise ValueError('Failed to read font record: %s Headers: %s'%( + self.font['err'], self.font['headers'])) + self.payload = (self.font['font_data'] if self.font['font_data'] else + 
self.font['raw_data']) + self.name = '%s.%s'%(name, self.font['ext']) + + def dump(self, folder): + with open(os.path.join(folder, self.name), 'wb') as f: + f.write(self.payload) + +# }}} + class TBSIndexing(object): # {{{ def __init__(self, text_records, indices, doc_type): @@ -1382,7 +1407,7 @@ class MOBIFile(object): # {{{ self.index_header = self.index_record = None self.indexing_record_nums = set() pir = self.mobi_header.primary_index_record - if pir != 0xffffffff: + if pir != NULL_INDEX: self.index_header = IndexHeader(self.records[pir]) self.cncx = CNCX(self.records[ pir+2:pir+2+self.index_header.num_of_cncx_blocks], @@ -1393,7 +1418,7 @@ class MOBIFile(object): # {{{ pir+2+self.index_header.num_of_cncx_blocks)) self.secondary_index_record = self.secondary_index_header = None sir = self.mobi_header.secondary_index_record - if sir != 0xffffffff: + if sir != NULL_INDEX: self.secondary_index_header = SecondaryIndexHeader(self.records[sir]) self.indexing_record_nums.add(sir) self.secondary_index_record = SecondaryIndexRecord( @@ -1404,12 +1429,13 @@ class MOBIFile(object): # {{{ ntr = self.mobi_header.number_of_text_records fntbr = self.mobi_header.first_non_book_record fii = self.mobi_header.first_image_index - if fntbr == 0xffffffff: + if fntbr == NULL_INDEX: fntbr = len(self.records) self.text_records = [TextRecord(r, self.records[r], self.mobi_header.extra_data_flags, decompress) for r in xrange(1, min(len(self.records), ntr+1))] self.image_records, self.binary_records = [], [] + self.font_records = [] image_index = 0 for i in xrange(fntbr, len(self.records)): if i in self.indexing_record_nums or i in self.huffman_record_nums: @@ -1419,13 +1445,15 @@ class MOBIFile(object): # {{{ fmt = None if i >= fii and r.raw[:4] not in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'RESC', b'BOUN', b'FDST', b'DATP', - b'AUDI', b'VIDE'}: + b'AUDI', b'VIDE', b'FONT'}: try: width, height, fmt = identify_data(r.raw) except: pass if fmt is not None: 
self.image_records.append(ImageRecord(image_index, r, fmt)) + elif r.raw[:4] == b'FONT': + self.font_records.append(FontRecord(i, r)) else: self.binary_records.append(BinaryRecord(i, r)) @@ -1465,10 +1493,11 @@ def inspect_mobi(path_or_stream, ddir=None): # {{{ of.write(rec.raw) alltext += rec.raw of.seek(0) - root = html.fromstring(alltext.decode('utf-8')) - with open(os.path.join(ddir, 'pretty.html'), 'wb') as of: - of.write(html.tostring(root, pretty_print=True, encoding='utf-8', - include_meta_content_type=True)) + if f.mobi_header.file_version < 8: + root = html.fromstring(alltext.decode('utf-8')) + with open(os.path.join(ddir, 'pretty.html'), 'wb') as of: + of.write(html.tostring(root, pretty_print=True, encoding='utf-8', + include_meta_content_type=True)) if f.index_header is not None: @@ -1490,7 +1519,7 @@ def inspect_mobi(path_or_stream, ddir=None): # {{{ f.tbs_indexing.dump(ddir) for tdir, attr in [('text', 'text_records'), ('images', 'image_records'), - ('binary', 'binary_records')]: + ('binary', 'binary_records'), ('font', 'font_records')]: tdir = os.path.join(ddir, tdir) os.mkdir(tdir) for rec in getattr(f, attr): diff --git a/src/calibre/ebooks/mobi/reader/headers.py b/src/calibre/ebooks/mobi/reader/headers.py index 8cff1360de..eaad81730d 100644 --- a/src/calibre/ebooks/mobi/reader/headers.py +++ b/src/calibre/ebooks/mobi/reader/headers.py @@ -27,29 +27,30 @@ class EXTHHeader(object): # {{{ self.has_fake_cover = True self.start_offset = None left = self.num_items + self.kf8_header = None while left > 0: left -= 1 - id, size = struct.unpack('>LL', raw[pos:pos + 8]) + idx, size = struct.unpack('>LL', raw[pos:pos + 8]) content = raw[pos + 8:pos + size] pos += size - if id >= 100 and id < 200: - self.process_metadata(id, content, codec) - elif id == 203: + if idx >= 100 and idx < 200: + self.process_metadata(idx, content, codec) + elif idx == 203: self.has_fake_cover = bool(struct.unpack('>L', content)[0]) - elif id == 201: + elif idx == 201: co, = 
struct.unpack('>L', content) if co < NULL_INDEX: self.cover_offset = co - elif id == 202: + elif idx == 202: self.thumbnail_offset, = struct.unpack('>L', content) - elif id == 501: + elif idx == 501: # cdetype pass - elif id == 502: + elif idx == 502: # last update time pass - elif id == 503: # Long title + elif idx == 503: # Long title # Amazon seems to regard this as the definitive book title # rather than the title from the PDB header. In fact when # sending MOBI files through Amazon's email service if the @@ -60,43 +61,45 @@ class EXTHHeader(object): # {{{ except: pass #else: - # print 'unknown record', id, repr(content) + # print 'unknown record', idx, repr(content) if title: self.mi.title = replace_entities(title) - def process_metadata(self, id, content, codec): - if id == 100: - if self.mi.authors == [_('Unknown')]: + def process_metadata(self, idx, content, codec): + if idx == 100: + if self.mi.is_null('authors'): self.mi.authors = [] au = content.decode(codec, 'ignore').strip() self.mi.authors.append(au) if re.match(r'\S+?\s*,\s+\S+', au.strip()): self.mi.author_sort = au.strip() - elif id == 101: + elif idx == 101: self.mi.publisher = content.decode(codec, 'ignore').strip() - elif id == 103: + elif idx == 103: self.mi.comments = content.decode(codec, 'ignore') - elif id == 104: + elif idx == 104: self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '') - elif id == 105: + elif idx == 105: if not self.mi.tags: self.mi.tags = [] self.mi.tags.extend([x.strip() for x in content.decode(codec, 'ignore').split(';')]) self.mi.tags = list(set(self.mi.tags)) - elif id == 106: + elif idx == 106: try: self.mi.pubdate = parse_date(content, as_utc=False) except: pass - elif id == 108: - pass # Producer - elif id == 113: + elif idx == 108: + self.mi.book_producer = content.decode(codec, 'ignore').strip() + elif idx == 113: pass # ASIN or UUID - elif id == 116: + elif idx == 116: self.start_offset, = struct.unpack(b'>L', content) + elif idx == 121: + 
self.kf8_header, = struct.unpack(b'>L', content) #else: - # print 'unhandled metadata record', id, repr(content) + # print 'unhandled metadata record', idx, repr(content) # }}} class BookHeader(object): diff --git a/src/calibre/ebooks/mobi/reader/markup.py b/src/calibre/ebooks/mobi/reader/markup.py index cb47297717..26583cf30c 100644 --- a/src/calibre/ebooks/mobi/reader/markup.py +++ b/src/calibre/ebooks/mobi/reader/markup.py @@ -154,6 +154,8 @@ def update_flow_links(mobi8_reader, resource_map, log): 'valid font in %s' % (num, tag)) else: replacement = '"%s"'%('../'+ href) + if href.endswith('.failed'): + replacement = '"%s"'%('failed-'+href) tag = font_index_pattern.sub(replacement, tag, 1) # process links to other css pieces diff --git a/src/calibre/ebooks/mobi/reader/mobi6.py b/src/calibre/ebooks/mobi/reader/mobi6.py index 896a9ebc2a..962c38a0ba 100644 --- a/src/calibre/ebooks/mobi/reader/mobi6.py +++ b/src/calibre/ebooks/mobi/reader/mobi6.py @@ -105,29 +105,29 @@ class MobiReader(object): user_encoding, self.log, try_extra_data_fix=try_extra_data_fix) self.name = self.name.decode(self.book_header.codec, 'replace') self.kf8_type = None - is_kf8 = self.book_header.mobi_version == 8 - if is_kf8: + k8i = getattr(self.book_header.exth, 'kf8_header', None) + + if self.book_header.mobi_version == 8: self.kf8_type = 'standalone' - else: # Check for joint mobi 6 and kf 8 file - KF8_BOUNDARY = b'BOUNDARY' - for i, x in enumerate(self.sections[:-1]): - sec = x[0] - if (len(sec) == len(KF8_BOUNDARY) and sec == - KF8_BOUNDARY): - try: - self.book_header = BookHeader(self.sections[i+1][0], - self.ident, user_encoding, self.log) - # The following are only correct in the Mobi 6 - # header not the Mobi 8 header - for x in ('first_image_index',): - setattr(self.book_header, x, getattr(bh, x)) - if hasattr(self.book_header, 'huff_offset'): - self.book_header.huff_offset += i + 1 - self.kf8_type = 'joint' - self.kf8_boundary = i - except: - self.book_header = bh - break + elif k8i 
is not None: # Check for joint mobi 6 and kf 8 file + try: + raw = self.sections[k8i-1][0] + except: + raw = None + if raw == b'BOUNDARY': + try: + self.book_header = BookHeader(self.sections[k8i][0], + self.ident, user_encoding, self.log) + # The following are only correct in the Mobi 6 + # header not the Mobi 8 header + for x in ('first_image_index',): + setattr(self.book_header, x, getattr(bh, x)) + if hasattr(self.book_header, 'huff_offset'): + self.book_header.huff_offset += k8i + self.kf8_type = 'joint' + self.kf8_boundary = k8i-1 + except: + self.book_header = bh def check_for_drm(self): if self.book_header.encryption_type != 0: diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index dbe027f521..f5421bc9ea 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import struct, re, os, zlib, imghdr +import struct, re, os, imghdr from collections import namedtuple from itertools import repeat @@ -16,6 +16,7 @@ from calibre.ebooks.mobi.reader.index import read_index from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup from calibre.ebooks.metadata.opf2 import Guide, OPFCreator +from calibre.ebooks.mobi.utils import read_font_record Part = namedtuple('Part', 'num type filename start end aid') @@ -339,23 +340,16 @@ class Mobi8Reader(object): b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}: pass # Ignore these records elif typ == b'FONT': - # fonts only exist in K8 ebooks - # Format: - # bytes 0 - 3: 'FONT' - # bytes 4 - 7: ?? Expanded size in bytes ?? - # bytes 8 - 11: ?? number of files ?? - # bytes 12 - 15: ?? offset to start of compressed data ?? (typically 0x00000018 = 24) - # bytes 16 - 23: ?? typically all 0x00 ?? Are these compression flags from zlib? 
- # The compressed data begins with 2 bytes of header and has 4 bytes of checksum at the end - data = data[26:-4] - uncompressed_data = zlib.decompress(data, -15) - hdr = uncompressed_data[0:4] - ext = 'dat' - if hdr == b'\0\1\0\0' or hdr == b'true' or hdr == b'ttcf': - ext = 'ttf' - href = "fonts/%05d.%s" % (fname_idx, ext) + font = read_font_record(data) + href = "fonts/%05d.%s" % (fname_idx, font['ext']) + if font['err']: + self.log.warn('Reading font record %d failed: %s'%( + fname_idx, font['err'])) + if font['headers']: + self.log.debug('Font record headers: %s'%font['headers']) with open(href.replace('/', os.sep), 'wb') as f: - f.write(uncompressed_data) + f.write(font['font_data'] if font['font_data'] else + font['raw_data']) else: imgtype = imghdr.what(None, data) if imgtype is None: @@ -379,7 +373,11 @@ class Mobi8Reader(object): opf = OPFCreator(os.getcwdu(), mi) opf.guide = guide - opf.create_manifest_from_files_in([os.getcwdu()]) + + def exclude(path): + return os.path.basename(path) == 'debug-raw.html' + + opf.create_manifest_from_files_in([os.getcwdu()], exclude=exclude) opf.create_spine(spine) opf.set_toc(toc) diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index cc30991392..6ec86f77ee 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import struct, string +import struct, string, imghdr, zlib from collections import OrderedDict from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail @@ -124,12 +124,18 @@ def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None): to JPEG. Ensure the resultant image has a byte size less than maxsizeb. 
- If dimen is not None, generate a thumbnail of width=dimen, height=dimen + If dimen is not None, generate a thumbnail of + width=dimen, height=dimen or width, height = dimen (depending on the type + of dimen) Returns the image as a bytestring ''' if dimen is not None: - data = thumbnail(data, width=dimen, height=dimen, + if hasattr(dimen, '__len__'): + width, height = dimen + else: + width = height = dimen + data = thumbnail(data, width=width, height=height, compression_quality=90)[-1] else: # Replace transparent pixels with white pixels and convert to JPEG @@ -363,3 +369,127 @@ def to_base(num, base=32): ans.reverse() return ''.join(ans) +def mobify_image(data): + 'Convert PNG images to GIF as the idiotic Kindle cannot display some PNG' + what = imghdr.what(None, data) + + if what == 'png': + im = Image() + im.load(data) + data = im.export('gif') + return data + +def read_zlib_header(header): + header = bytearray(header) + # See sec 2.2 of RFC 1950 for the zlib stream format + # http://www.ietf.org/rfc/rfc1950.txt + if (header[0]*256 + header[1])%31 != 0: + return None, 'Bad zlib header, FCHECK failed' + + cmf = header[0] & 0b1111 + cinfo = header[0] >> 4 + if cmf != 8: + return None, 'Unknown zlib compression method: %d'%cmf + if cinfo > 7: + return None, 'Invalid CINFO field in zlib header: %d'%cinfo + fdict = (header[1]&0b10000)>>5 + if fdict != 0: + return None, 'FDICT based zlib compression not supported' + wbits = cinfo + 8 + return wbits, None + + +def read_font_record(data, extent=1040): # {{{ + ''' + Return the font encoded in the MOBI FONT record represented by data. + The return value in a dict with fields raw_data, font_data, err, ext, + headers. + + :param extent: The number of obfuscated bytes. So far I have only + encountered files with 1040 obfuscated bytes. If you encounter an + obfuscated record for which this function fails, try different extent + values (easily automated). 
+ + raw_data is the raw data in the font record + font_data is the decoded font_data or None if an error occurred + err is not None if some error occurred + ext is the font type (ttf for TrueType, dat for unknown and failed if an + error occurred) + headers is the list of decoded headers from the font record or None if + decoding failed + ''' + # Format: + # bytes 0 - 3: 'FONT' + # bytes 4 - 7: Uncompressed size + # bytes 8 - 11: flags + # bit 1 - zlib compression + # bit 2 - XOR obfuscated + # bytes 12 - 15: offset to start of compressed data + # bytes 16 - 19: length of XOR string + # bytes 19 - 23: offset to start of XOR data + # The zlib compressed data begins with 2 bytes of header and + # has 4 bytes of checksum at the end + ans = {'raw_data':data, 'font_data':None, 'err':None, 'ext':'failed', + 'headers':None} + + try: + usize, flags, dstart, xor_len, xor_start = struct.unpack_from( + b'>LLLLL', data, 4) + except: + ans['err'] = 'Failed to read font record header fields' + return ans + font_data = data[dstart:] + ans['headers'] = {'usize':usize, 'flags':bin(flags), 'xor_len':xor_len, + 'xor_start':xor_start, 'dstart':dstart} + + if flags & 0b10: + # De-obfuscate the data + key = bytearray(data[xor_start:xor_start+xor_len]) + buf = bytearray(font_data) + extent = len(font_data) if extent is None else extent + extent = min(extent, len(font_data)) + + for n in xrange(extent): + buf[n] ^= key[n%xor_len] # XOR of buf and key + + font_data = bytes(buf) + + if flags & 0b1: + # ZLIB compressed data + wbits, err = read_zlib_header(font_data[:2]) + if err is not None: + ans['err'] = err + return ans + adler32, = struct.unpack_from(b'>I', font_data, len(font_data) - 4) + try: + # remove two bytes of zlib header and 4 bytes of trailing checksum + # negative wbits indicates no standard gzip header + font_data = zlib.decompress(font_data[2:-4], -wbits, usize) + except Exception as e: + ans['err'] = 'Failed to zlib decompress font data (%s)'%e + return ans + + if 
len(font_data) != usize: + ans['err'] = 'Uncompressed font size mismatch' + return ans + + if False: + # For some reason these almost never match, probably Amazon has a + # buggy Adler32 implementation + sig = (zlib.adler32(font_data) & 0xffffffff) + if sig != adler32: + ans['err'] = ('Adler checksum did not match. Stored: %d ' + 'Calculated: %d')%(adler32, sig) + return ans + + ans['font_data'] = font_data + sig = font_data[:4] + ans['ext'] = ('ttf' if sig in {b'\0\1\0\0', b'true', b'ttcf'} + else 'otf' if sig == b'OTTO' else 'dat') + + return ans +# }}} + + + + diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py deleted file mode 100644 index 40e9eeedd0..0000000000 --- a/src/calibre/ebooks/mobi/writer.py +++ /dev/null @@ -1,2950 +0,0 @@ -''' -Write content to Mobipocket books. -''' - -__license__ = 'GPL v3' -__copyright__ = '2008, Marshall T. Vandegrift and \ - Kovid Goyal ' - -from collections import defaultdict -import random -import re -from struct import pack -import time -from urlparse import urldefrag -from cStringIO import StringIO - -from calibre.ebooks import normalize -from calibre.ebooks.mobi.langcodes import iana2mobi -from calibre.ebooks.mobi.mobiml import MBP_NS -from calibre.ebooks.oeb.base import OEB_DOCS -from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES -from calibre.ebooks.oeb.base import XHTML -from calibre.ebooks.oeb.base import XHTML_NS -from calibre.ebooks.oeb.base import XML_NS -from calibre.ebooks.oeb.base import namespace -from calibre.ebooks.oeb.base import prefixname -from calibre.ebooks.oeb.base import urlnormalize -from calibre.ebooks.compression.palmdoc import compress_doc -from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail - -INDEXING = True -FCIS_FLIS = True -WRITE_PBREAKS = True - -# TODO: -# - Optionally rasterize tables - -EXTH_CODES = { - 'creator': 100, - 'publisher': 101, - 'description': 103, - 'identifier': 104, - 'subject': 105, - 'pubdate': 106, - 'date': 106, - 
'review': 107, - 'contributor': 108, - 'rights': 109, - 'type': 111, - 'source': 112, - 'title': 503, - } - -RECORD_SIZE = 0x1000 - -UNCOMPRESSED = 1 -PALMDOC = 2 -HUFFDIC = 17480 - -PALM_MAX_IMAGE_SIZE = 63 * 1024 -OTHER_MAX_IMAGE_SIZE = 10 * 1024 * 1024 -MAX_THUMB_SIZE = 16 * 1024 -MAX_THUMB_DIMEN = (180, 240) - - -TAGX = { - 'chapter' : - '\x00\x00\x00\x01\x01\x01\x01\x00\x02\x01\x02\x00\x03\x01\x04\x00\x04\x01\x08\x00\x00\x00\x00\x01', - 'subchapter' : - '\x00\x00\x00\x01\x01\x01\x01\x00\x02\x01\x02\x00\x03\x01\x04\x00\x04\x01\x08\x00\x05\x01\x10\x00\x15\x01\x10\x00\x16\x01\x20\x00\x17\x01\x40\x00\x00\x00\x00\x01', - 'periodical' : - '\x00\x00\x00\x02\x01\x01\x01\x00\x02\x01\x02\x00\x03\x01\x04\x00\x04\x01\x08\x00\x05\x01\x10\x00\x15\x01\x20\x00\x16\x01\x40\x00\x17\x01\x80\x00\x00\x00\x00\x01\x45\x01\x01\x00\x46\x01\x02\x00\x47\x01\x04\x00\x00\x00\x00\x01', - 'secondary_book':'\x00\x00\x00\x01\x01\x01\x01\x00\x00\x00\x00\x01', - 'secondary_periodical':'\x00\x00\x00\x01\x01\x01\x01\x00\x0b\x03\x02\x00\x00\x00\x00\x01' - } - -INDXT = { - 'chapter' : '\x0f', - 'subchapter' : '\x1f', - 'article' : '\x3f', - 'chapter with subchapters': '\x6f', - 'periodical' : '\xdf', - 'section' : '\xff', - } - -def encode(data): - return data.encode('utf-8') - -# Almost like the one for MS LIT, but not quite. 
-DECINT_FORWARD = 0 -DECINT_BACKWARD = 1 -def decint(value, direction): - # Encode vwi - bytes = [] - while True: - b = value & 0x7f - value >>= 7 - bytes.append(b) - if value == 0: - break - if direction == DECINT_FORWARD: - bytes[0] |= 0x80 - elif direction == DECINT_BACKWARD: - bytes[-1] |= 0x80 - return ''.join(chr(b) for b in reversed(bytes)) - -def align_block(raw, multiple=4, pad='\0'): - extra = len(raw) % multiple - if extra == 0: return raw - return raw + pad*(multiple - extra) - -def rescale_image(data, maxsizeb, dimen=None): - if dimen is not None: - data = thumbnail(data, width=dimen[0], height=dimen[1], - compression_quality=90)[-1] - else: - # Replace transparent pixels with white pixels and convert to JPEG - data = save_cover_data_to(data, 'img.jpg', return_data=True) - if len(data) <= maxsizeb: - return data - orig_data = data - img = Image() - quality = 95 - - img.load(data) - while len(data) >= maxsizeb and quality >= 10: - quality -= 5 - img.set_compression_quality(quality) - data = img.export('jpg') - if len(data) <= maxsizeb: - return data - orig_data = data - - scale = 0.9 - while len(data) >= maxsizeb and scale >= 0.05: - img = Image() - img.load(orig_data) - w, h = img.size - img.size = (int(scale*w), int(scale*h)) - img.set_compression_quality(quality) - data = img.export('jpg') - scale -= 0.05 - return data - -class Serializer(object): # {{{ - NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'} - - def __init__(self, oeb, images, write_page_breaks_after_item=True): - self.oeb = oeb - self.images = images - self.logger = oeb.logger - self.write_page_breaks_after_item = write_page_breaks_after_item - self.id_offsets = {} - self.href_offsets = defaultdict(list) - self.breaks = [] - buffer = self.buffer = StringIO() - buffer.write('') - self.serialize_head() - self.serialize_body() - buffer.write('') - self.fixup_links() - self.text = buffer.getvalue() - - def serialize_head(self): - buffer = self.buffer - buffer.write('') - if 
len(self.oeb.guide) > 0: - self.serialize_guide() - buffer.write('') - - def serialize_guide(self): - buffer = self.buffer - hrefs = self.oeb.manifest.hrefs - buffer.write('') - for ref in self.oeb.guide.values(): - # The Kindle decides where to open a book based on the presence of - # an item in the guide that looks like - # - path = urldefrag(ref.href)[0] - if path not in hrefs or hrefs[path].media_type not in OEB_DOCS: - continue - - buffer.write('') - - buffer.write('') - - def serialize_href(self, href, base=None): - hrefs = self.oeb.manifest.hrefs - path, frag = urldefrag(urlnormalize(href)) - if path and base: - path = base.abshref(path) - if path and path not in hrefs: - return False - buffer = self.buffer - item = hrefs[path] if path else None - if item and item.spine_position is None: - return False - path = item.href if item else base.href - href = '#'.join((path, frag)) if frag else path - buffer.write('filepos=') - self.href_offsets[href].append(buffer.tell()) - buffer.write('0000000000') - return True - - def serialize_body(self): - buffer = self.buffer - self.anchor_offset = buffer.tell() - buffer.write('') - self.anchor_offset_kindle = buffer.tell() - spine = [item for item in self.oeb.spine if item.linear] - spine.extend([item for item in self.oeb.spine if not item.linear]) - for item in spine: - self.serialize_item(item) - buffer.write('') - - def serialize_item(self, item): - buffer = self.buffer - if not item.linear: - self.breaks.append(buffer.tell() - 1) - self.id_offsets[urlnormalize(item.href)] = buffer.tell() - # Kindle periodical articles are contained in a
tag - buffer.write('
') - for elem in item.data.find(XHTML('body')): - self.serialize_elem(elem, item) - # Kindle periodical article end marker - buffer.write('
') - if self.write_page_breaks_after_item: - buffer.write('') - buffer.write('
') - self.anchor_offset = None - - def serialize_elem(self, elem, item, nsrmap=NSRMAP): - buffer = self.buffer - if not isinstance(elem.tag, basestring) \ - or namespace(elem.tag) not in nsrmap: - return - tag = prefixname(elem.tag, nsrmap) - # Previous layers take care of @name - id = elem.attrib.pop('id', None) - if id: - href = '#'.join((item.href, id)) - offset = self.anchor_offset or buffer.tell() - self.id_offsets[urlnormalize(href)] = offset - if self.anchor_offset is not None and \ - tag == 'a' and not elem.attrib and \ - not len(elem) and not elem.text: - return - self.anchor_offset = buffer.tell() - buffer.write('<') - buffer.write(tag) - if elem.attrib: - for attr, val in elem.attrib.items(): - if namespace(attr) not in nsrmap: - continue - attr = prefixname(attr, nsrmap) - buffer.write(' ') - if attr == 'href': - if self.serialize_href(val, item): - continue - elif attr == 'src': - href = urlnormalize(item.abshref(val)) - if href in self.images: - index = self.images[href] - buffer.write('recindex="%05d"' % index) - continue - buffer.write(attr) - buffer.write('="') - self.serialize_text(val, quot=True) - buffer.write('"') - buffer.write('>') - if elem.text or len(elem) > 0: - if elem.text: - self.anchor_offset = None - self.serialize_text(elem.text) - for child in elem: - self.serialize_elem(child, item) - if child.tail: - self.anchor_offset = None - self.serialize_text(child.tail) - buffer.write('' % tag) - - def serialize_text(self, text, quot=False): - text = text.replace('&', '&') - text = text.replace('<', '<') - text = text.replace('>', '>') - text = text.replace(u'\u00AD', '') # Soft-hyphen - if quot: - text = text.replace('"', '"') - self.buffer.write(encode(text)) - - def fixup_links(self): - buffer = self.buffer - id_offsets = self.id_offsets - for href, hoffs in self.href_offsets.items(): - if href not in id_offsets: - self.logger.warn('Hyperlink target %r not found' % href) - href, _ = urldefrag(href) - if href in self.id_offsets: - ioff = 
self.id_offsets[href] - for hoff in hoffs: - buffer.seek(hoff) - buffer.write('%010d' % ioff) - - # }}} - -class MobiWriter(object): - COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') - - def __init__(self, opts, - write_page_breaks_after_item=True): - self.opts = opts - self.write_page_breaks_after_item = write_page_breaks_after_item - self._compression = UNCOMPRESSED if getattr(opts, 'dont_compress', - False) else PALMDOC - self._imagemax = (PALM_MAX_IMAGE_SIZE if getattr(opts, - 'rescale_images', False) else OTHER_MAX_IMAGE_SIZE) - self._prefer_author_sort = getattr(opts, 'prefer_author_sort', False) - self._primary_index_record = None - self._conforming_periodical_toc = False - self._indexable = False - self._ctoc = "" - self._ctoc_records = [] - self._ctoc_offset = 0 - self._ctoc_largest = 0 - self._HTMLRecords = [] - self._tbSequence = "" - self._MobiDoc = None - self._anchor_offset_kindle = 0 - self._initialIndexRecordFound = False - self._firstSectionConcluded = False - self._currentSectionIndex = 0 - - @classmethod - def generate(cls, opts): - """Generate a Writer instance from command-line options.""" - imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None - prefer_author_sort = opts.prefer_author_sort - return cls(compression=PALMDOC, imagemax=imagemax, - prefer_author_sort=prefer_author_sort) - - def __call__(self, oeb, path): - if hasattr(path, 'write'): - return self._dump_stream(oeb, path) - with open(path, 'w+b') as stream: - return self._dump_stream(oeb, stream) - - def _write(self, * data): - for datum in data: - self._stream.write(datum) - - def _tell(self): - return self._stream.tell() - - def _dump_stream(self, oeb, stream): - self._oeb = oeb - self._stream = stream - self._records = [None] - self._generate_content() - self._generate_record0() - self._write_header() - self._write_content() - - def _generate_content(self): - self._map_image_names() - self._generate_text() - - if INDEXING and self._indexable : - try: - self._generate_index() - 
except: - self._oeb.log.exception('Failed to generate index') - - self._generate_images() - - def _map_image_names(self): - index = 1 - self._images = images = {} - mh_href = None - - if 'masthead' in self._oeb.guide: - mh_href = self._oeb.guide['masthead'].href - images[mh_href] = 1 - index += 1 - - for item in self._oeb.manifest.values(): - if item.media_type in OEB_RASTER_IMAGES: - if item.href == mh_href: continue - images[item.href] = index - index += 1 - - def _read_text_record(self, text): - pos = text.tell() - text.seek(0, 2) - npos = min((pos + RECORD_SIZE, text.tell())) - last = '' - while not last.decode('utf-8', 'ignore'): - size = len(last) + 1 - text.seek(npos - size) - last = text.read(size) - extra = 0 - try: - last.decode('utf-8') - except UnicodeDecodeError: - prev = len(last) - while True: - text.seek(npos - prev) - last = text.read(len(last) + 1) - try: - last.decode('utf-8') - except UnicodeDecodeError: - pass - else: - break - extra = len(last) - prev - text.seek(pos) - data = text.read(RECORD_SIZE) - overlap = text.read(extra) - text.seek(npos) - return data, overlap - - # TBS {{{ - def _generate_flat_indexed_navpoints(self): - # Assemble a HTMLRecordData instance for each HTML record - # Return True if valid, False if invalid - self._oeb.logger.info('Indexing flat navPoints ...') - - numberOfHTMLRecords = ( self._content_length // RECORD_SIZE ) + 1 - - # Create a list of HTMLRecordData class instances - x = numberOfHTMLRecords - while x: - self._HTMLRecords.append(HTMLRecordData()) - x -= 1 - - toc = self._oeb.toc - myIndex = 0 - myEndingRecord = 0 - previousOffset = 0 - previousLength = 0 - offset = 0 - length = 0 - entries = list(toc.iter())[1:] - - # Get offset, length per entry - for (i, child) in enumerate(entries): - if not child.title or not child.title.strip(): - child.title = "(none)" - - if not child.title or not child.title.strip(): - child.title = "(none)" - - h = child.href - if h not in self._id_offsets: - 
self._oeb.log.warning(' Could not find TOC entry "%s", aborting indexing ...'% child.title) - return False - offset = self._id_offsets[h] - - length = None - - for sibling in entries[i+1:]: - h2 = sibling.href - if h2 in self._id_offsets: - offset2 = self._id_offsets[h2] - if offset2 > offset: - length = offset2 - offset - break - - if length is None: - length = self._content_length - offset - - if self.opts.verbose > 3 : - self._oeb.logger.info("child %03d: %s" % (i, child)) - self._oeb.logger.info(" title: %s" % child.title) - self._oeb.logger.info(" depth: %d" % child.depth()) - self._oeb.logger.info(" offset: 0x%06X \tlength: 0x%06X \tnext: 0x%06X" % (offset, length, offset + length)) - - # Look a gap between chapter nodes. Don't evaluate periodical or section nodes - if (i and child.depth() == 1 and entries[i-1].depth() == 1) : - if offset != previousOffset + previousLength : - self._oeb.log.warning("*** TOC discontinuity ***") - self._oeb.log.warning(" node %03d: '%s' offset: 0x%X length: 0x%X" % \ - (i-1, entries[i-1].title, previousOffset, previousLength) ) - self._oeb.log.warning(" node %03d: '%s' offset: 0x%X != 0x%06X" % \ - (i, child.title, offset, previousOffset + previousLength) ) - self._oeb.log.warning('_generate_flat_indexed_navpoints: Failed to generate index') - # Zero out self._HTMLRecords, return False - self._HTMLRecords = [] - #last_name = None - return False - - previousOffset = offset - previousLength = length - - # Calculate the HTML record for this entry - myStartingRecord = offset // RECORD_SIZE - - # If no one has taken the openingNode slot, it must be us - if self._HTMLRecords[myStartingRecord].openingNode == -1 : - self._HTMLRecords[myStartingRecord].openingNode = myIndex - - # Bump the node count for this HTML record - # Special case if we're the first so we get a true node count - if self._HTMLRecords[myStartingRecord].currentSectionNodeCount == -1: - self._HTMLRecords[myStartingRecord].currentSectionNodeCount = 1 - else: - 
self._HTMLRecords[myStartingRecord].currentSectionNodeCount += 1 - - # Calculate the ending HTMLRecord of this entry - myEndingRecord = (offset + length) // RECORD_SIZE - - if myEndingRecord > myStartingRecord : - interimSpanRecord = myStartingRecord + 1 - while interimSpanRecord <= myEndingRecord : - self._HTMLRecords[interimSpanRecord].continuingNode = myIndex - self._HTMLRecords[interimSpanRecord].currentSectionNodeCount = 1 - interimSpanRecord += 1 - if self.opts.verbose > 3 :self._oeb.logger.info(" node %03d: %-15.15s... spans HTML records %03d - %03d \t offset: 0x%06X length: 0x%06X" % \ - (myIndex, child.title if child.title.strip() > "" else "(missing)", myStartingRecord, interimSpanRecord, offset, length) ) - else : - if self.opts.verbose > 3 : self._oeb.logger.info(" node %03d: %-15.15s... spans HTML records %03d - %03d \t offset: 0x%06X length: 0x%06X" % \ - (myIndex, child.title if child.title.strip() > "" else "(missing)", myStartingRecord, myStartingRecord, offset, length) ) - - myIndex += 1 - - # Successfully parsed the entries - return True - - def _generate_indexed_navpoints(self): - # Assemble a HTMLRecordData instance for each HTML record - # Return True if valid, False if invalid - self._oeb.logger.info('Indexing navPoints ...') - - numberOfHTMLRecords = ( self._content_length // RECORD_SIZE ) + 1 - - # Create a list of HTMLRecordData class instances - x = numberOfHTMLRecords - while x: - self._HTMLRecords.append(HTMLRecordData()) - x -= 1 - - toc = self._oeb.toc - myIndex = 0 - myEndingRecord = 0 - previousOffset = 0 - previousLength = 0 - offset = 0 - length = 0 - sectionChangedInRecordNumber = -1 - sectionChangesInThisRecord = False - entries = list(toc.iter())[1:] - - # Get offset, length per entry - for (firstSequentialNode, node) in enumerate(list(self._ctoc_map)) : - if node['klass'] != 'article' and node['klass'] != 'chapter' : - # Skip periodical and section entries - continue - else : - if self.opts.verbose > 3 
:self._oeb.logger.info("\tFirst sequential node: %03d" % firstSequentialNode) - break - - for i, child in enumerate(entries): - # Entries continues with a stream of section+articles, section+articles ... - h = child.href - if h not in self._id_offsets: - self._oeb.log.warning(' Could not find TOC entry "%s", aborting indexing ...'% child.title) - return False - offset = self._id_offsets[h] - - length = None - - for sibling in entries[i+1:]: - h2 = sibling.href - if h2 in self._id_offsets: - offset2 = self._id_offsets[h2] - if offset2 > offset: - length = offset2 - offset - break - - if length is None: - length = self._content_length - offset - - if self.opts.verbose > 3 : - self._oeb.logger.info("child %03d: %s" % (i, child)) - self._oeb.logger.info(" title: %s" % child.title) - self._oeb.logger.info(" depth: %d" % child.depth()) - self._oeb.logger.info(" offset: 0x%06X \tlength: 0x%06X \tnext: 0x%06X" % (offset, length, offset + length)) - - # Look a gap between nodes, articles/chapters only, as - # periodical and section lengths cover spans of articles - if (i>firstSequentialNode) and self._ctoc_map[i-1]['klass'] != 'section': - if offset != previousOffset + previousLength : - self._oeb.log.warning("*** TOC discontinuity: nodes are not sequential ***") - self._oeb.log.info(" node %03d: '%s' offset: 0x%X length: 0x%X" % \ - (i-1, entries[i-1].title, previousOffset, previousLength) ) - self._oeb.log.warning(" node %03d: '%s' offset: 0x%X != 0x%06X" % \ - (i, child.title, offset, previousOffset + previousLength) ) - # self._oeb.log.warning("\tnode data %03d: %s" % (i-1, self._ctoc_map[i-1]) ) - # self._oeb.log.warning("\tnode data %03d: %s" % (i, self._ctoc_map[i]) ) - # Dump the offending entry - self._oeb.log.info("...") - for z in range(i-6 if i-6 > 0 else 0, i+6 if i+6 < len(entries) else len(entries)): - if z == i: - self._oeb.log.warning("child %03d: %s" % (z, entries[z])) - else: - self._oeb.log.info("child %03d: %s" % (z, entries[z])) - 
self._oeb.log.info("...") - - self._oeb.log.warning('_generate_indexed_navpoints: Failed to generate index') - # Zero out self._HTMLRecords, return False - self._HTMLRecords = [] - return False - - previousOffset = offset - previousLength = length - - # Calculate the HTML record for this entry - thisRecord = offset // RECORD_SIZE - - # Store the current continuingNodeParent and openingNodeParent - if self._ctoc_map[i]['klass'] == 'article': - if thisRecord > 0 : - if sectionChangesInThisRecord : # <<< - self._HTMLRecords[thisRecord].continuingNodeParent = self._currentSectionIndex - 1 - else : - self._HTMLRecords[thisRecord].continuingNodeParent = self._currentSectionIndex - - # periodical header? - if self._ctoc_map[i]['klass'] == 'periodical' : - # INCREMENT currentSectionNode count - # Commented out because structured docs don't count section changes in nodeCount - # compensation at 948 for flat periodicals - # self._HTMLRecords[thisRecord].currentSectionNodeCount = 1 - continue - - # Is this node a new section? 
- if self._ctoc_map[i]['klass'] == 'section' : - # INCREMENT currentSectionNode count - # Commented out because structured docs don't count section changes in nodeCount - # self._HTMLRecords[thisRecord].currentSectionNodeCount += 1 - - # *** This should check currentSectionNumber, because content could start late - if thisRecord > 0: - sectionChangesInThisRecord = True - #sectionChangesInRecordNumber = thisRecord - self._currentSectionIndex += 1 - self._HTMLRecords[thisRecord].nextSectionNumber = self._currentSectionIndex - # The following node opens the nextSection - self._HTMLRecords[thisRecord].nextSectionOpeningNode = myIndex - continue - else : - continue - - - # If no one has taken the openingNode slot, it must be us - # This could happen before detecting a section change - if self._HTMLRecords[thisRecord].openingNode == -1 : - self._HTMLRecords[thisRecord].openingNode = myIndex - self._HTMLRecords[thisRecord].openingNodeParent = self._currentSectionIndex - - # Bump the nextSection node count while we're in the same record - if sectionChangedInRecordNumber == thisRecord : - if self._ctoc_map[i]['klass'] == 'article' : - if self._HTMLRecords[thisRecord].nextSectionNodeCount == -1: - self._HTMLRecords[thisRecord].nextSectionNodeCount = 1 - else: - self._HTMLRecords[thisRecord].nextSectionNodeCount += 1 - else : - # Bump the currentSectionNodeCount one last time - self._HTMLRecords[thisRecord].currentSectionNodeCount += 1 - - else : - # Reset the change record - # sectionChangedInRecordNumber = -1 - sectionChangesInThisRecord = False - if self._HTMLRecords[thisRecord].currentSectionNodeCount == -1: - self._HTMLRecords[thisRecord].currentSectionNodeCount = 1 - else: - self._HTMLRecords[thisRecord].currentSectionNodeCount += 1 - - # Fill in the spanning records - myEndingRecord = (offset + length) // RECORD_SIZE - if myEndingRecord > thisRecord : - sectionChangesInThisRecord = False - interimSpanRecord = thisRecord + 1 - while interimSpanRecord <= myEndingRecord : 
- self._HTMLRecords[interimSpanRecord].continuingNode = myIndex - - self._HTMLRecords[interimSpanRecord].continuingNodeParent = self._currentSectionIndex - self._HTMLRecords[interimSpanRecord].currentSectionNodeCount = 1 - interimSpanRecord += 1 - - if self.opts.verbose > 3 :self._oeb.logger.info(" node: %03d %-10.10s %-15.15s... spans HTML records %03d-%03d \t offset: 0x%06X length: 0x%06X" % \ - (myIndex, self._ctoc_map[i]['klass'], child.title if child.title.strip() > "" else "(missing)", thisRecord, interimSpanRecord, offset, length) ) - elif thisRecord == numberOfHTMLRecords-1: - # Check for short terminating record (GR provisional) - if self._HTMLRecords[thisRecord].continuingNode == -1: - self._HTMLRecords[thisRecord].continuingNode = self._HTMLRecords[thisRecord].openingNode - 1 - else : - if self.opts.verbose > 3 : self._oeb.logger.info(" node: %03d %-10.10s %-15.15s... spans HTML records %03d-%03d \t offset: 0x%06X length: 0x%06X" % \ - (myIndex, self._ctoc_map[i]['klass'], child.title if child.title.strip() > "" else "(missing)", thisRecord, thisRecord, offset, length) ) - - myIndex += 1 - - # Successfully parsed the entries - return True - - def _generate_tbs_book(self, nrecords, lastrecord): - if self.opts.verbose > 3 :self._oeb.logger.info("Assembling TBS for Book: HTML record %03d of %03d" % \ - (nrecords, lastrecord) ) - # Variables for trailing byte sequence - tbsType = 0x00 - tbSequence = "" - - # Generate TBS for type 0x002 - mobi_book - if self._initialIndexRecordFound == False : - - # Is there any indexed content yet? 
- if self._HTMLRecords[nrecords].currentSectionNodeCount == -1 : - # No indexing data - write vwi length of 1 only - tbSequence = decint(len(tbSequence) + 1, DECINT_FORWARD) - - else : - # First indexed HTML record is a special case - # One or more nodes - self._initialIndexRecordFound = True - if self._HTMLRecords[nrecords].currentSectionNodeCount == 1 : - tbsType = 2 - else : - tbsType = 6 - - tbSequence = decint(tbsType, DECINT_FORWARD) - tbSequence += decint(0x00, DECINT_FORWARD) - # Don't write a nodecount for opening type 2 record - if tbsType != 2 : - # Check that <> -1 - tbSequence += chr(self._HTMLRecords[nrecords].currentSectionNodeCount) - tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD) - - else : - # Determine tbsType for indexed HTMLRecords - if nrecords == lastrecord and self._HTMLRecords[nrecords].currentSectionNodeCount == 1 : - # Ending record with singleton node - tbsType = 2 - - elif self._HTMLRecords[nrecords].continuingNode > 0 and self._HTMLRecords[nrecords].openingNode == -1 : - # This is a span-only record - tbsType = 3 - # Zero out the nodeCount with a pre-formed vwi - self._HTMLRecords[nrecords].currentSectionNodeCount = 0x80 - - else : - tbsType = 6 - - - # Shift the openingNode index << 3 - shiftedNCXEntry = self._HTMLRecords[nrecords].continuingNode << 3 - # Add the TBS type - shiftedNCXEntry |= tbsType - - # Assemble the TBS - tbSequence = decint(shiftedNCXEntry, DECINT_FORWARD) - tbSequence += decint(0x00, DECINT_FORWARD) - # Don't write a nodecount for terminating type 2 record - if tbsType != 2 : - tbSequence += chr(self._HTMLRecords[nrecords].currentSectionNodeCount) - tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD) - - self._tbSequence = tbSequence - - def _generate_tbs_flat_periodical(self, nrecords, lastrecord): - # Flat periodicals <0x102> have a single section for all articles - # Structured periodicals <0x101 | 0x103> have one or more sections with articles - # The first section TBS sequence is different 
for Flat and Structured - # This function is called once per HTML record - - # Variables for trailing byte sequence - tbsType = 0x00 - tbSequence = "" - - # Generate TBS for type 0x102 - mobi_feed - flat periodical - if self._initialIndexRecordFound == False : - # Is there any indexed content yet? - if self._HTMLRecords[nrecords].currentSectionNodeCount == -1 : - # No indexing data - write vwi length of 1 only - tbSequence = decint(len(tbSequence) + 1, DECINT_FORWARD) - - else : - # First indexed record: Type 6 with nodeCount only - self._initialIndexRecordFound = True - tbsType = 6 - tbSequence = decint(tbsType, DECINT_FORWARD) - tbSequence += decint(0x00, DECINT_FORWARD) - # nodeCount = 0xDF + 0xFF + n(0x3F) - need to add 2 because we didn't count them earlier - tbSequence += chr(self._HTMLRecords[nrecords].currentSectionNodeCount + 2) - tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD) - if self.opts.verbose > 2 : - self._oeb.logger.info("\nAssembling TBS for Flat Periodical: HTML record %03d of %03d, section %d" % \ - (nrecords, lastrecord, self._HTMLRecords[nrecords].continuingNodeParent ) ) - self._HTMLRecords[nrecords].dumpData(nrecords, self._oeb) - - else : - # An HTML record with nextSectionNumber = -1 has no section change in this record - # Default for flat periodicals with only one section - if self.opts.verbose > 2 : - self._oeb.logger.info("\nAssembling TBS for Flat Periodical: HTML record %03d of %03d, section %d" % \ - (nrecords, lastrecord, self._HTMLRecords[nrecords].continuingNodeParent ) ) - self._HTMLRecords[nrecords].dumpData(nrecords, self._oeb) - - # First section has different Type values - # Determine tbsType for HTMLRecords > 0 - if nrecords == lastrecord and self._HTMLRecords[nrecords].currentSectionNodeCount == 1 : - # Ending record with singleton node - tbsType = 6 - - # Assemble the Type 6 TBS - tbSequence = decint(tbsType, DECINT_FORWARD) # Type - tbSequence += decint(0x00, DECINT_FORWARD) # arg1 = 0x80 - tbSequence += 
chr(2) # arg2 = 0x02 - - # Assemble arg3 - (article index +1) << 4 + flag: 1 = article spans this record - arg3 = self._HTMLRecords[nrecords].continuingNode - arg3 += 1 - arg3 <<= 4 - arg3 |= 0x0 #flags = 0 - tbSequence += decint(arg3, DECINT_FORWARD) # arg3 - - - # tbSequence += chr(self._HTMLRecords[nrecords].currentSectionNodeCount) # nodeCount - tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD) # len - - elif self._HTMLRecords[nrecords].continuingNode > 0 and self._HTMLRecords[nrecords].openingNode == -1 : - # This is a span-only record - tbsType = 6 - # Zero out the nodeCount with a pre-formed vwi - self._HTMLRecords[nrecords].currentSectionNodeCount = 0x80 - - # Assemble the Type 6 TBS - tbSequence = decint(tbsType, DECINT_FORWARD) # Type - tbSequence += decint(0x00, DECINT_FORWARD) # arg1 = 0x80 - tbSequence += chr(2) # arg2 = 0x02 - # Assemble arg3 - article index << 3 + flag: 1 = article spans this record - arg3 = self._HTMLRecords[nrecords].continuingNode - # Add the index of the openingNodeParent to get the offset start - # We know that section 0 is at position 1, section 1 at index 2, etc. - arg3 += self._HTMLRecords[nrecords].continuingNodeParent + 1 - arg3 <<= 4 - arg3 |= 0x01 - tbSequence += decint(arg3, DECINT_FORWARD) # arg3 - tbSequence += chr(self._HTMLRecords[nrecords].currentSectionNodeCount) # nodeCount - tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD) # len - - else : - tbsType = 7 - # Assemble the Type 7 TBS - tbSequence = decint(tbsType, DECINT_FORWARD) # Type - tbSequence += decint(0x00, DECINT_FORWARD) # arg1 = 0x80 - tbSequence += chr(2) # arg2 = 0x02 - tbSequence += decint(0x00, DECINT_FORWARD) # arg3 = 0x80 - # Assemble arg4 - article index << 4 + flag: 1 = article spans this record - arg4 = self._HTMLRecords[nrecords].continuingNode - # Add the index of the openingNodeParent to get the offset start - # We know that section 0 is at position 1, section 1 at index 2, etc. 
- arg4 += self._HTMLRecords[nrecords].continuingNodeParent + 1 - arg4 <<= 4 - arg4 |= 0x04 # 4: multiple nodes - tbSequence += decint(arg4, DECINT_FORWARD) # arg4 - tbSequence += chr(self._HTMLRecords[nrecords].currentSectionNodeCount) # nodeCount - tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD) # len - - self._tbSequence = tbSequence - - def _generate_tbs_structured_periodical(self, nrecords, lastrecord): - # Structured periodicals <0x101 | 0x103> have one or more sections for all articles - # The first section TBS sequences is different for Flat and Structured - # This function is called once per HTML record - - # Variables for trailing byte sequence - tbsType = 0x00 - tbSequence = "" - - # Generate TBS for type 0x101/0x103 - structured periodical - if self._initialIndexRecordFound == False : - # Is there any indexed content yet? - if self._HTMLRecords[nrecords].currentSectionNodeCount == -1 : - # No indexing data - write vwi length of 1 only - tbSequence = decint(len(tbSequence) + 1, DECINT_FORWARD) - - else : - self._initialIndexRecordFound = True - - if self.opts.verbose > 2 : - self._oeb.logger.info("\nAssembling TBS for Structured Periodical: HTML record %03d of %03d, section %d" % \ - (nrecords, lastrecord, self._HTMLRecords[nrecords].continuingNodeParent ) ) - self._HTMLRecords[nrecords].dumpData(nrecords, self._oeb) - - # First record only - tbsType = 6 - # Assemble the Type 6 TBS - tbSequence = decint(tbsType, DECINT_FORWARD) # Type - tbSequence += decint(0x00, DECINT_FORWARD) # arg1 = 0x80 - tbSequence += chr(2) # arg2 = 0x02 - # Assemble arg3: (section jump + article index) << 4 + flag: 1 = article spans this record - arg3 = self._sectionCount # Jump over the section group - arg3 += 0 # First article index = 0 - arg3 <<= 4 - arg3 |= 0x04 - tbSequence += decint(arg3, DECINT_FORWARD) # arg3 - - # Structured periodicals don't count periodical, section in nodeCount - tbSequence += chr(self._HTMLRecords[nrecords].currentSectionNodeCount) # 
nodeCount - tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD) # len - else : - if self._firstSectionConcluded == False : - # Use type 6 & 7 until first section switch, then 2 - - if self._HTMLRecords[nrecords].nextSectionNumber == -1 : - # An HTML record with nextSectionNumber = -1 has no section change in this record - if self.opts.verbose > 2 : - self._oeb.logger.info("\nAssembling TBS for Structured Periodical: HTML record %03d of %03d, section %d" % \ - (nrecords, lastrecord, self._HTMLRecords[nrecords].continuingNodeParent ) ) - self._HTMLRecords[nrecords].dumpData(nrecords, self._oeb) - - # First section has different Type values - # Determine tbsType for HTMLRecords > 0 - if nrecords == lastrecord and self._HTMLRecords[nrecords].currentSectionNodeCount == 1 : - # Ending record with singleton node - tbsType = 6 - - # Assemble the Type 6 TBS - tbSequence = decint(tbsType, DECINT_FORWARD) # Type - tbSequence += decint(0x00, DECINT_FORWARD) # arg1 = 0x80 - tbSequence += chr(2) # arg2 = 0x02 - # Assemble arg3: (section jump + article index) << 4 + flag: 1 = article spans this record - arg3 = self._sectionCount - arg3 += self._HTMLRecords[nrecords].continuingNode - arg3 <<= 4 - arg3 |= 0x04 - tbSequence += decint(arg3, DECINT_FORWARD) # arg3 - tbSequence += chr(self._HTMLRecords[nrecords].currentSectionNodeCount) # nodeCount - tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD) # len - - elif self._HTMLRecords[nrecords].continuingNode > 0 and self._HTMLRecords[nrecords].openingNode == -1 : - # This is a span-only record - tbsType = 6 - # Zero out the nodeCount with a pre-formed vwi - self._HTMLRecords[nrecords].currentSectionNodeCount = 0x80 - - # Assemble the Type 6 TBS - tbSequence = decint(tbsType, DECINT_FORWARD) # Type - tbSequence += decint(0x00, DECINT_FORWARD) # arg1 = 0x80 - tbSequence += chr(2) # arg2 = 0x02 - # Assemble arg3: (section jump + article index) << 4 + flag: 1 = article spans this record - arg3 = self._sectionCount - arg3 += 
self._HTMLRecords[nrecords].continuingNode - arg3 <<= 4 - arg3 |= 0x01 - tbSequence += decint(arg3, DECINT_FORWARD) # arg3 - tbSequence += chr(self._HTMLRecords[nrecords].currentSectionNodeCount) # nodeCount - tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD) # len - - else : - tbsType = 7 - # Assemble the Type 7 TBS - tbSequence = decint(tbsType, DECINT_FORWARD) # Type - tbSequence += decint(0x00, DECINT_FORWARD) # arg1 = 0x80 - tbSequence += chr(2) # arg2 = 0x02 - tbSequence += decint(0x00, DECINT_FORWARD) # arg3 = 0x80 - # Assemble arg4: (section jump + article index) << 4 + flag: 1 = article spans this record - arg4 = self._sectionCount - arg4 += self._HTMLRecords[nrecords].continuingNode - arg4 <<= 4 - arg4 |= 0x04 # 4: multiple nodes - tbSequence += decint(arg4, DECINT_FORWARD) # arg4 - tbSequence += chr(self._HTMLRecords[nrecords].currentSectionNodeCount) # nodeCount - tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD) # len - - - # Initial section switch from section 1 - elif self._HTMLRecords[nrecords].nextSectionNumber > 0 : - tbsType = 3 - - if self.opts.verbose > 2 : - self._oeb.logger.info("\nAssembling TBS for Structured Periodical: HTML record %03d of %03d, switching sections %d-%d" % \ - (nrecords, lastrecord, self._HTMLRecords[nrecords].continuingNodeParent, self._HTMLRecords[nrecords].nextSectionNumber) ) - self._HTMLRecords[nrecords].dumpData(nrecords, self._oeb) - - tbSequence = decint(tbsType, DECINT_FORWARD) # Type - tbSequence += decint(0x00, DECINT_FORWARD) # arg1 = 0x80 - tbSequence += decint(0x00, DECINT_FORWARD) # arg2 = 0x80 - - # Assemble arg3: Upper nybble: ending section index - # Lower nybble = flags for next section - 0 or 1 - arg3 = (self._HTMLRecords[nrecords].continuingNodeParent + 1) << 4 - arg3Flags = 0 # 0: has nodes? 
- arg3 |= arg3Flags - tbSequence += decint(arg3, DECINT_FORWARD) - - # Assemble arg4: Upper nybble: continuingNode << 4 - # Lower nybble: flag: 0 = no starting nodes from previous section - # flag: 4 = starting nodes from previous section - - sectionBase = self._HTMLRecords[nrecords].continuingNodeParent - sectionDelta = self._sectionCount - sectionBase - 1 - articleOffset = self._HTMLRecords[nrecords].continuingNode + 1 - arg4 = (sectionDelta + articleOffset) << 4 - - arg4Flags = 0 - if self._HTMLRecords[nrecords].currentSectionNodeCount > 1 : - arg4Flags = 4 - else : - arg4Flags = 0 - arg4 |= arg4Flags - tbSequence += decint(arg4, DECINT_FORWARD) # arg4 - - # Write optional 4a if previous section node count > 1 - if arg4Flags == 4 : # arg4a - nodeCountValue = self._HTMLRecords[nrecords].currentSectionNodeCount - nodeCountValue = 0x80 if nodeCountValue == 0 else nodeCountValue - tbSequence += chr(nodeCountValue) - - # Write article2: not completely understood - arg5 = sectionDelta + articleOffset - if self._HTMLRecords[nrecords].currentSectionNodeCount < 2: - arg5 -= 1 - arg5 <<= 4 - arg5Flags = 8 - arg5 |= arg5Flags - tbSequence += decint(arg5, DECINT_FORWARD) # arg5 - - # Write first article of new section - #arg6 = self._sectionCount - 1 # We're now into the following section - #arg6 = self._HTMLRecords[nrecords].nextSectionNumber - arg6 = sectionDelta + self._HTMLRecords[nrecords].nextSectionOpeningNode - arg6 <<= 4 - if self._HTMLRecords[nrecords].nextSectionNodeCount > 1 : - arg6Flags = 4 - else : - arg6Flags = 0 - arg6 |= arg6Flags - tbSequence += decint(arg6, DECINT_FORWARD) # arg5 - - # Write optional 6a if previous section node count > 1 - if arg6Flags == 4 : # arg4a - nodeCountValue = self._HTMLRecords[nrecords].nextSectionNodeCount - nodeCountValue = 0x80 if nodeCountValue == 0 else nodeCountValue - tbSequence += chr(nodeCountValue) - - tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD) # len - - self._firstSectionConcluded = True - else : - # 
After first section switch, use types 2 and 3 - if self._HTMLRecords[nrecords].nextSectionNumber == -1 : - if self.opts.verbose > 2 : - self._oeb.logger.info("\nAssembling TBS for Structured Periodical: HTML record %03d of %03d, section %d" % \ - (nrecords, lastrecord, self._HTMLRecords[nrecords].continuingNodeParent ) ) - self._HTMLRecords[nrecords].dumpData(nrecords, self._oeb) - - tbsType = 2 - tbSequence = decint(tbsType, DECINT_FORWARD) # Type - tbSequence += decint(0x00, DECINT_FORWARD) # arg1 = 0x80 - arg2 = self._HTMLRecords[nrecords].continuingNodeParent + 1 - arg2 <<= 4 - # Add flag = 1 if there are multiple nodes in this record - arg2Flags = 0 - if self._HTMLRecords[nrecords].currentSectionNodeCount > 0 : - arg2Flags = 1 - arg2 |= arg2Flags - tbSequence += decint(arg2, DECINT_FORWARD) - - if arg2Flags : - # Add an extra vwi 0x00 - tbSequence += decint(0x00, DECINT_FORWARD) # arg2Flags = 0x80 - - # arg3 - offset of continuingNode from sectionParent - arg3 = self._sectionCount - self._HTMLRecords[nrecords].continuingNodeParent # Total guess - arg3 += self._HTMLRecords[nrecords].continuingNode - arg3 <<= 4 - arg3Flags = 1 - if self._HTMLRecords[nrecords].currentSectionNodeCount > 0 : - arg3Flags = 4 - arg3 |= arg3Flags - tbSequence += decint(arg3, DECINT_FORWARD) - - if arg3Flags == 4 : - nodeCountValue = self._HTMLRecords[nrecords].currentSectionNodeCount - nodeCountValue = 0x80 if nodeCountValue == 0 else nodeCountValue - tbSequence += chr(nodeCountValue) - else : - tbSequence += decint(0x00, DECINT_FORWARD) # arg1 = 0x80 - - tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD) # len - - else : - # Section switch when section > 1 - tbsType = 3 - - if self.opts.verbose > 2 : - self._oeb.logger.info("\nAssembling TBS for Structured Periodical: HTML record %03d of %03d, switching sections %d-%d" % \ - (nrecords, lastrecord, self._HTMLRecords[nrecords].continuingNodeParent, self._HTMLRecords[nrecords].nextSectionNumber) ) - 
self._HTMLRecords[nrecords].dumpData(nrecords, self._oeb) - - tbSequence = decint(tbsType, DECINT_FORWARD) # Type - tbSequence += decint(0x00, DECINT_FORWARD) # arg1 = 0x80 - tbSequence += decint(0x00, DECINT_FORWARD) # arg2 = 0x80 - - # arg3: continuingNodeParent section - # Upper nybble: ending section index - # Lower nybble = flags for next section - 0 or 1 - arg3 = (self._HTMLRecords[nrecords].continuingNodeParent + 1) << 4 - arg3Flags = 0 # 0: has nodes? - arg3 |= arg3Flags - tbSequence += decint(arg3, DECINT_FORWARD) - - # Assemble arg4: Upper nybble: continuingNode << 4 - # Lower nybble: flag: 0 = no starting nodes from previous section - # flag: 4 = starting nodes from previous section - sectionBase = self._HTMLRecords[nrecords].continuingNodeParent - sectionDelta = self._sectionCount - sectionBase - 1 - articleOffset = self._HTMLRecords[nrecords].continuingNode + 1 - arg4 = (sectionDelta + articleOffset) << 4 - - arg4Flags = 0 - if self._HTMLRecords[nrecords].currentSectionNodeCount > 1 : - arg4Flags = 4 - else : - arg4Flags = 0 - arg4 |= arg4Flags - tbSequence += decint(arg4, DECINT_FORWARD) # arg4 - - # Write optional 4a if previous section node count > 1 - if arg4Flags == 4 : # arg4a - nodeCountValue = self._HTMLRecords[nrecords].currentSectionNodeCount - nodeCountValue = 0x80 if nodeCountValue == 0 else nodeCountValue - tbSequence += chr(nodeCountValue) - - # Write article2: not completely understood - arg5 = sectionDelta + articleOffset - if self._HTMLRecords[nrecords].currentSectionNodeCount < 2: - arg5 -= 1 - arg5 <<= 4 - arg5Flags = 8 - arg5 |= arg5Flags - tbSequence += decint(arg5, DECINT_FORWARD) # arg5 - - # Write first article of new section - arg6 = sectionDelta + self._HTMLRecords[nrecords].nextSectionOpeningNode - arg6 <<= 4 - if self._HTMLRecords[nrecords].nextSectionNodeCount > 1 : - arg6Flags = 4 - else : - arg6Flags = 0 - arg6 |= arg6Flags - tbSequence += decint(arg6, DECINT_FORWARD) # arg5 - - # Write optional 6a if previous section 
node count > 1 - if arg6Flags == 4 : # arg4a - nodeCountValue = self._HTMLRecords[nrecords].nextSectionNodeCount - nodeCountValue = 0x80 if nodeCountValue == 0 else nodeCountValue - tbSequence += chr(nodeCountValue) - - tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD) # len - - self._tbSequence = tbSequence - - # }}} - - def _evaluate_periodical_toc(self): - ''' - Periodical: - depth=4 - depth=3 1 - depth=2 1 or more - depth=1 multiple - Book: - depth=2 - depth=1 multiple - ''' - toc = self._oeb.toc - nodes = list(toc.iter())[1:] - toc_conforms = True - for child in nodes: - if child.klass == "periodical" and child.depth() != 3 or \ - child.klass == "section" and child.depth() != 2 or \ - child.klass == "article" and child.depth() != 1 : - - self._oeb.logger.warn('Nonconforming TOC entry: "%s" found at depth %d' % \ - (child.klass, child.depth()) ) - self._oeb.logger.warn(" : '%-25.25s...' \t\tklass=%-15.15s \tdepth:%d \tplayOrder=%03d" % \ - (child.title, child.klass, child.depth(), child.play_order) ) - toc_conforms = False - - # We also need to know that we have a pubdate or timestamp in the metadata, which the Kindle needs - if self._oeb.metadata['date'] == [] and self._oeb.metadata['timestamp'] == [] : - self._oeb.logger.info('metadata missing date/timestamp') - toc_conforms = False - - if not 'masthead' in self._oeb.guide : - self._oeb.logger.info('mastheadImage missing from manifest') - toc_conforms = False - - self._oeb.logger.info("%s" % " TOC structure conforms" if toc_conforms else " TOC structure non-conforming") - return toc_conforms - - def _generate_text(self): - self._oeb.logger.info('Serializing markup content...') - serializer = Serializer(self._oeb, self._images, - write_page_breaks_after_item=self.write_page_breaks_after_item) - breaks = serializer.breaks - text = serializer.text - self._anchor_offset_kindle = serializer.anchor_offset_kindle - self._id_offsets = serializer.id_offsets - self._content_length = len(text) - 
self._text_length = len(text) - text = StringIO(text) - buf = [] - nrecords = 0 - lastrecord = (self._content_length // RECORD_SIZE ) - offset = 0 - - if self._compression != UNCOMPRESSED: - self._oeb.logger.info(' Compressing markup content...') - data, overlap = self._read_text_record(text) - - if not self.opts.mobi_periodical: - self._flatten_toc() - - # Evaluate toc for conformance - if self.opts.mobi_periodical : - self._oeb.logger.info(' MOBI periodical specified, evaluating TOC for periodical conformance ...') - self._conforming_periodical_toc = self._evaluate_periodical_toc() - - # This routine decides whether to build flat or structured based on self._conforming_periodical_toc - # self._ctoc = self._generate_ctoc() - - # There may be multiple CNCX records built below, but the last record is returned and should be stored - self._ctoc_records.append(self._generate_ctoc()) - - # Build the HTMLRecords list so we can assemble the trailing bytes sequences in the following while loop - toc = self._oeb.toc - entries = list(toc.iter())[1:] - - if len(entries) : - self._indexable = self._generate_indexed_navpoints() - else : - self._oeb.logger.info(' No entries found in TOC ...') - self._indexable = False - - if not self._indexable : - self._oeb.logger.info(' Writing unindexed mobi ...') - - while len(data) > 0: - if self._compression == PALMDOC: - data = compress_doc(data) - record = StringIO() - record.write(data) - # Write trailing muti-byte sequence if any - record.write(overlap) - record.write(pack('>B', len(overlap))) - - if WRITE_PBREAKS : - nextra = 0 - pbreak = 0 - running = offset - while breaks and (breaks[0] - offset) < RECORD_SIZE: - # .pop returns item, removes it from list - pbreak = (breaks.pop(0) - running) >> 3 - if self.opts.verbose > 2 : - self._oeb.logger.info('pbreak = 0x%X at 0x%X' % (pbreak, record.tell()) ) - encoded = decint(pbreak, DECINT_FORWARD) - record.write(encoded) - running += pbreak << 3 - nextra += len(encoded) - lsize = 1 - while 
True: - size = decint(nextra + lsize, DECINT_BACKWARD) - if len(size) == lsize: - break - lsize += 1 - record.write(size) - - # Write Trailing Byte Sequence - if INDEXING and self._indexable: - # Dispatch to different TBS generators based upon publication type - booktype = self._MobiDoc.mobiType - if booktype == 0x002 : - self._generate_tbs_book(nrecords, lastrecord) - elif booktype == 0x102 : - self._generate_tbs_flat_periodical(nrecords, lastrecord) - elif booktype == 0x101 or booktype == 0x103 : - self._generate_tbs_structured_periodical(nrecords, lastrecord) - else : - raise NotImplementedError('Indexing for mobitype 0x%X not implemented' % booktype) - - # Write the sequence - record.write(self._tbSequence) - - self._records.append(record.getvalue()) - buf.append(self._records[-1]) - nrecords += 1 - offset += RECORD_SIZE - data, overlap = self._read_text_record(text) - - if INDEXING: - extra = sum(map(len, buf))%4 - if extra == 0: - extra = 4 - self._records.append('\0'*(4-extra)) - nrecords += 1 - self._text_nrecords = nrecords - - def _generate_images(self): - self._oeb.logger.info('Serializing images...') - images = [(index, href) for href, index in self._images.items()] - images.sort() - self._first_image_record = None - for _, href in images: - item = self._oeb.manifest.hrefs[href] - try: - data = rescale_image(item.data, self._imagemax) - except: - self._oeb.logger.warn('Bad image file %r' % item.href) - continue - finally: - item.unload_data_from_memory() - self._records.append(data) - if self._first_image_record is None: - self._first_image_record = len(self._records)-1 - - def _generate_end_records(self): - if FCIS_FLIS : - # This adds the binary blobs of FLIS and FCIS, which don't seem to be necessary - self._flis_number = len(self._records) - self._records.append( - 'FLIS\0\0\0\x08\0\x41\0\0\0\0\0\0\xff\xff\xff\xff\0\x01\0\x03\0\0\0\x03\0\0\0\x01'+ - '\xff'*4) - fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00' - fcis += 
pack('>I', self._text_length) - fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' - self._fcis_number = len(self._records) - self._records.append(fcis) - self._records.append('\xE9\x8E\x0D\x0A') - - else : - self._flis_number = len(self._records) - self._records.append('\xE9\x8E\x0D\x0A') - - def _generate_record0(self): - metadata = self._oeb.metadata - exth = self._build_exth() - last_content_record = len(self._records) - 1 - - ''' - if INDEXING and self._indexable: - self._generate_end_records() - ''' - self._generate_end_records() - - record0 = StringIO() - # The PalmDOC Header - record0.write(pack('>HHIHHHH', self._compression, 0, - self._text_length, - self._text_nrecords-1, RECORD_SIZE, 0, 0)) # 0 - 15 (0x0 - 0xf) - uid = random.randint(0, 0xffffffff) - title = normalize(unicode(metadata.title[0])).encode('utf-8') - # The MOBI Header - - # 0x0 - 0x3 - record0.write('MOBI') - - # 0x4 - 0x7 : Length of header - # 0x8 - 0x11 : MOBI type - # type meaning - # 0x002 MOBI book (chapter - chapter navigation) - # 0x101 News - Hierarchical navigation with sections and articles - # 0x102 News feed - Flat navigation - # 0x103 News magazine - same as 0x101 - # 0xC - 0xF : Text encoding (65001 is utf-8) - # 0x10 - 0x13 : UID - # 0x14 - 0x17 : Generator version - - btype = self._MobiDoc.mobiType - - record0.write(pack('>IIIII', - 0xe8, btype, 65001, uid, 6)) - - # 0x18 - 0x1f : Unknown - record0.write('\xff' * 8) - - - # 0x20 - 0x23 : Secondary index record - if btype < 0x100 : - record0.write(pack('>I', 0xffffffff)) - elif btype > 0x100 and self._indexable : - if self._primary_index_record is None: - record0.write(pack('>I', 0xffffffff)) - else: - record0.write(pack('>I', self._primary_index_record + 2 + len(self._ctoc_records))) - else : - record0.write(pack('>I', 0xffffffff)) - - # 0x24 - 0x3f : Unknown - record0.write('\xff' * 28) - - # 0x40 - 0x43 : Offset of first non-text record - record0.write(pack('>I', - 
self._text_nrecords + 1)) - - # 0x44 - 0x4b : title offset, title length - record0.write(pack('>II', - 0xe8 + 16 + len(exth), len(title))) - - # 0x4c - 0x4f : Language specifier - record0.write(iana2mobi( - str(metadata.language[0]))) - - # 0x50 - 0x57 : Unknown - record0.write('\0' * 8) - - # 0x58 - 0x5b : Format version - # 0x5c - 0x5f : First image record number - record0.write(pack('>II', - 6, self._first_image_record if self._first_image_record else 0)) - - # 0x60 - 0x63 : First HUFF/CDIC record number - # 0x64 - 0x67 : Number of HUFF/CDIC records - # 0x68 - 0x6b : First DATP record number - # 0x6c - 0x6f : Number of DATP records - record0.write('\0' * 16) - - # 0x70 - 0x73 : EXTH flags - record0.write(pack('>I', 0x50)) - - # 0x74 - 0x93 : Unknown - record0.write('\0' * 32) - - # 0x94 - 0x97 : DRM offset - # 0x98 - 0x9b : DRM count - # 0x9c - 0x9f : DRM size - # 0xa0 - 0xa3 : DRM flags - record0.write(pack('>IIII', - 0xffffffff, 0xffffffff, 0, 0)) - - - # 0xa4 - 0xaf : Unknown - record0.write('\0'*12) - - # 0xb0 - 0xb1 : First content record number - # 0xb2 - 0xb3 : last content record number - # (Includes Image, DATP, HUFF, DRM) - record0.write(pack('>HH', 1, last_content_record)) - - # 0xb4 - 0xb7 : Unknown - record0.write('\0\0\0\x01') - - # 0xb8 - 0xbb : FCIS record number - if FCIS_FLIS : - # Write these if FCIS/FLIS turned on - # 0xb8 - 0xbb : FCIS record number - record0.write(pack('>I', self._fcis_number)) - - # 0xbc - 0xbf : Unknown (FCIS record count?) - record0.write(pack('>I', 1)) - - # 0xc0 - 0xc3 : FLIS record number - record0.write(pack('>I', self._flis_number)) - - # 0xc4 - 0xc7 : Unknown (FLIS record count?) - record0.write(pack('>I', 1)) - else : - # 0xb8 - 0xbb : FCIS record number - record0.write(pack('>I', 0xffffffff)) - - # 0xbc - 0xbf : Unknown (FCIS record count?) - record0.write(pack('>I', 0xffffffff)) - - # 0xc0 - 0xc3 : FLIS record number - record0.write(pack('>I', 0xffffffff)) - - # 0xc4 - 0xc7 : Unknown (FLIS record count?) 
- record0.write(pack('>I', 1)) - - # 0xc8 - 0xcf : Unknown - record0.write('\0'*8) - - # 0xd0 - 0xdf : Unknown - record0.write(pack('>IIII', 0xffffffff, 0, 0xffffffff, 0xffffffff)) - - # 0xe0 - 0xe3 : Extra record data - # Extra record data flags: - # - 0x1: <extra multibyte bytes><size> (?) - # - 0x2: <TBS indexing description of this HTML record><size> GR - # - 0x4: <uncrossable breaks><size> - # GR: Use 7 for indexed files, 5 for unindexed - # Setting bit 2 (0x4) disables <guide><reference type="start"> functionality - - trailingDataFlags = 1 - if self._indexable : - trailingDataFlags |= 2 - if WRITE_PBREAKS : - trailingDataFlags |= 4 - record0.write(pack('>I', trailingDataFlags)) - - # 0xe4 - 0xe7 : Primary index record - record0.write(pack('>I', 0xffffffff if self._primary_index_record is - None else self._primary_index_record)) - - record0.write(exth) - record0.write(title) - record0 = record0.getvalue() - self._records[0] = record0 + ('\0' * (1024*8)) - - def _build_exth(self): - oeb = self._oeb - exth = StringIO() - nrecs = 0 - for term in oeb.metadata: - if term not in EXTH_CODES: continue - code = EXTH_CODES[term] - items = oeb.metadata[term] - if term == 'creator': - if self._prefer_author_sort: - creators = [normalize(unicode(c.file_as or c)) for c in items] - else: - creators = [normalize(unicode(c)) for c in items] - items = ['; '.join(creators)] - for item in items: - data = self.COLLAPSE_RE.sub(' ', normalize(unicode(item))) - if term == 'identifier': - if data.lower().startswith('urn:isbn:'): - data = data[9:] - elif item.scheme.lower() == 'isbn': - pass - else: - continue - data = data.encode('utf-8') - exth.write(pack('>II', code, len(data) + 8)) - exth.write(data) - nrecs += 1 - if term == 'rights' : - try: - rights = normalize(unicode(oeb.metadata.rights[0])).encode('utf-8') - except: - rights = 'Unknown' - exth.write(pack('>II', EXTH_CODES['rights'], len(rights) + 8)) - exth.write(rights) - nrecs += 1 - - # Write UUID as ASIN - uuid = None - 
from calibre.ebooks.oeb.base import OPF - for x in oeb.metadata['identifier']: - if x.get(OPF('scheme'), None).lower() == 'uuid' or unicode(x).startswith('urn:uuid:'): - uuid = unicode(x).split(':')[-1] - break - if uuid is None: - from uuid import uuid4 - uuid = str(uuid4()) - - if isinstance(uuid, unicode): - uuid = uuid.encode('utf-8') - exth.write(pack('>II', 113, len(uuid) + 8)) - exth.write(uuid) - nrecs += 1 - - # Write cdetype - if not self.opts.mobi_periodical: - data = 'EBOK' - exth.write(pack('>II', 501, len(data)+8)) - exth.write(data) - nrecs += 1 - - # Add a publication date entry - if oeb.metadata['date'] != [] : - datestr = str(oeb.metadata['date'][0]) - elif oeb.metadata['timestamp'] != [] : - datestr = str(oeb.metadata['timestamp'][0]) - - if datestr is not None: - exth.write(pack('>II',EXTH_CODES['pubdate'], len(datestr) + 8)) - exth.write(datestr) - nrecs += 1 - else: - raise NotImplementedError("missing date or timestamp needed for mobi_periodical") - - if oeb.metadata.cover and \ - unicode(oeb.metadata.cover[0]) in oeb.manifest.ids: - id = unicode(oeb.metadata.cover[0]) - item = oeb.manifest.ids[id] - href = item.href - if href in self._images: - index = self._images[href] - 1 - exth.write(pack('>III', 0xc9, 0x0c, index)) - exth.write(pack('>III', 0xcb, 0x0c, 0)) - nrecs += 2 - index = self._add_thumbnail(item) - if index is not None: - exth.write(pack('>III', 0xca, 0x0c, index - 1)) - nrecs += 1 - - exth = exth.getvalue() - trail = len(exth) % 4 - pad = '\0' * (4 - trail) # Always pad w/ at least 1 byte - exth = ['EXTH', pack('>II', len(exth) + 12, nrecs), exth, pad] - return ''.join(exth) - - def _add_thumbnail(self, item): - try: - data = rescale_image(item.data, MAX_THUMB_SIZE, MAX_THUMB_DIMEN) - except IOError: - self._oeb.logger.warn('Bad image file %r' % item.href) - return None - manifest = self._oeb.manifest - id, href = manifest.generate('thumbnail', 'thumbnail.jpeg') - manifest.add(id, href, 'image/jpeg', data=data) - index = 
len(self._images) + 1 - self._images[href] = index - self._records.append(data) - return index - - def _write_header(self): - title = str(self._oeb.metadata.title[0]) - title = re.sub('[^-A-Za-z0-9]+', '_', title)[:31] - title = title + ('\0' * (32 - len(title))) - now = int(time.time()) - nrecords = len(self._records) - self._write(title, pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0), - 'BOOK', 'MOBI', pack('>IIH', (2*nrecords)-1, 0, nrecords)) - offset = self._tell() + (8 * nrecords) + 2 - for i, record in enumerate(self._records): - self._write(pack('>I', offset), '\0', pack('>I', 2*i)[1:]) - offset += len(record) - self._write('\0\0') - - def _write_content(self): - for record in self._records: - self._write(record) - - def _clean_text_value(self, text): - if text is not None and text.strip() : - text = text.strip() - if not isinstance(text, unicode): - text = text.decode('utf-8', 'replace') - text = normalize(text).encode('utf-8') - else : - text = "(none)".encode('utf-8') - return text - - def _compute_offset_length(self, i, node, entries) : - h = node.href - if h not in self._id_offsets: - self._oeb.log.warning('Could not find TOC entry:', node.title) - return -1, -1 - - offset = self._id_offsets[h] - length = None - # Calculate length based on next entry's offset - for sibling in entries[i+1:]: - h2 = sibling.href - if h2 in self._id_offsets: - offset2 = self._id_offsets[h2] - if offset2 > offset: - length = offset2 - offset - break - if length is None: - length = self._content_length - offset - return offset, length - - def _establish_document_structure(self) : - documentType = None - try : - klass = self._ctoc_map[0]['klass'] - except : - klass = None - - if klass == 'chapter' or klass == None : - documentType = 'book' - if self.opts.verbose > 2 : - self._oeb.logger.info("Adding a MobiBook to self._MobiDoc") - self._MobiDoc.documentStructure = MobiBook() - - elif klass == 'periodical' : - documentType = klass - if self.opts.verbose > 2 : - 
self._oeb.logger.info("Adding a MobiPeriodical to self._MobiDoc") - self._MobiDoc.documentStructure = MobiPeriodical(self._MobiDoc.getNextNode()) - self._MobiDoc.documentStructure.startAddress = self._anchor_offset_kindle - else : - raise NotImplementedError('_establish_document_structure: unrecognized klass: %s' % klass) - return documentType - - # Index {{{ - - def _flatten_toc(self): - ''' - Flatten and re-order entries in TOC so that chapter to chapter jumping - never fails on the Kindle. - ''' - from calibre.ebooks.oeb.base import TOC - items = list(self._oeb.toc.iterdescendants()) - offsets = {i:self._id_offsets.get(i.href, -1) for i in items if i.href} - items = [i for i in items if offsets[i] > -1] - items.sort(key=lambda i:offsets[i]) - filt = [] - seen = set() - for i in items: - off = offsets[i] - if off in seen: continue - seen.add(off) - filt.append(i) - items = filt - newtoc = TOC() - for c, i in enumerate(items): - newtoc.add(i.title, i.href, play_order=c+1, id=str(c), - klass='chapter') - self._oeb.toc = newtoc - - def _generate_index(self): - self._oeb.log('Generating INDX ...') - self._primary_index_record = None - - # Build the NCXEntries and INDX - indxt, indxt_count, indices, last_name = self._generate_indxt() - - if last_name is None: - self._oeb.log.warn('Input document has no TOC. 
No index generated.') - return - - # Assemble the INDX0[0] and INDX1[0] output streams - indx1 = StringIO() - indx1.write('INDX'+pack('>I', 0xc0)) # header length - - # 0x8 - 0xb : Unknown - indx1.write('\0'*4) - - # 0xc - 0xf : Header type - indx1.write(pack('>I', 1)) - - # 0x10 - 0x13 : Unknown - indx1.write('\0'*4) - - # 0x14 - 0x17 : IDXT offset - # 0x18 - 0x1b : IDXT count - indx1.write(pack('>I', 0xc0+len(indxt))) - indx1.write(pack('>I', indxt_count + 1)) - - # 0x1c - 0x23 : Unknown - indx1.write('\xff'*8) - - # 0x24 - 0xbf - indx1.write('\0'*156) - indx1.write(indxt) - indx1.write(indices) - indx1 = indx1.getvalue() - - idxt0 = chr(len(last_name)) + last_name + pack('>H', indxt_count + 1) - idxt0 = align_block(idxt0) - indx0 = StringIO() - - if self._MobiDoc.mobiType == 0x002 : - tagx = TAGX['chapter'] - else : - tagx = TAGX['periodical'] - - tagx = align_block('TAGX' + pack('>I', 8 + len(tagx)) + tagx) - indx0_indices_pos = 0xc0 + len(tagx) + len(idxt0) - indx0_indices = align_block('IDXT' + pack('>H', 0xc0 + len(tagx))) - # Generate record header - header = StringIO() - - header.write('INDX') - header.write(pack('>I', 0xc0)) # header length - - # 0x08 - 0x0b : Unknown - header.write('\0'*4) - - # 0x0c - 0x0f : Header type - header.write(pack('>I', 0)) - - # 0x10 - 0x13 : Generator ID - # This value may impact the position of flagBits written in - # write_article_node(). Change with caution. - header.write(pack('>I', 6)) - - # 0x14 - 0x17 : IDXT offset - header.write(pack('>I', indx0_indices_pos)) - - # 0x18 - 0x1b : IDXT count - header.write(pack('>I', 1)) - - # 0x1c - 0x1f : Text encoding ? - # header.write(pack('>I', 650001)) - # GR: This needs to be either 0xFDE9 or 0x4E4 - header.write(pack('>I', 0xFDE9)) - - # 0x20 - 0x23 : Language code? 
- header.write(iana2mobi(str(self._oeb.metadata.language[0]))) - - # 0x24 - 0x27 : Number of TOC entries in INDX1 - header.write(pack('>I', indxt_count + 1)) - - # 0x28 - 0x2b : ORDT Offset - header.write('\0'*4) - - # 0x2c - 0x2f : LIGT offset - header.write('\0'*4) - - # 0x30 - 0x33 : Number of LIGT entries - header.write('\0'*4) - - # 0x34 - 0x37 : Number of ctoc[] blocks - header.write(pack('>I', len(self._ctoc_records))) - - # 0x38 - 0xb3 : Unknown (pad?) - header.write('\0'*124) - - # 0xb4 - 0xb7 : TAGX offset - header.write(pack('>I', 0xc0)) - - # 0xb8 - 0xbf : Unknown - header.write('\0'*8) - - header = header.getvalue() - - indx0.write(header) - indx0.write(tagx) - indx0.write(idxt0) - indx0.write(indx0_indices) - indx0 = indx0.getvalue() - - self._primary_index_record = len(self._records) - - # GR: handle multiple ctoc records - self._records.extend([indx0, indx1 ]) - for (i,ctoc_record) in enumerate(self._ctoc_records): - self._records.append(ctoc_record) - # print "adding %d of %d ctoc records" % (i+1, len(self._ctoc_records)) - - # Indexing for author/description fields in summary section - # Test for indexed periodical - only one that needs secondary index - if self._MobiDoc.mobiType > 0x100 : - # Write secondary index records - #tagx = TAGX['secondary_'+\ - # ('periodical' if self.opts.mobi_periodical else 'book')] - tagx = TAGX['secondary_'+'periodical'] - tagx_len = 8 + len(tagx) - - # generate secondary INDX0 - indx0 = StringIO() - indx0.write('INDX'+pack('>I', 0xc0)+'\0'*8) # header + 8x00 - indx0.write(pack('>I', 0x06)) # generator ID - indx0.write(pack('>I', 0xe8)) # IDXT offset - indx0.write(pack('>I', 1)) # IDXT entries - indx0.write(pack('>I', 65001)) # encoding - indx0.write('\xff'*4) # language - indx0.write(pack('>I', 4)) # IDXT Entries in INDX1 - indx0.write('\0'*4) # ORDT Offset - indx0.write('\0'*136) # everything up to TAGX offset - indx0.write(pack('>I', 0xc0)) # TAGX offset - indx0.write('\0'*8) # unknowns - 
indx0.write('TAGX'+pack('>I', tagx_len)+tagx) # TAGX - indx0.write('\x0D'+'mastheadImage' + '\x00\x04') # mastheadImage - indx0.write('IDXT'+'\x00\xd8\x00\x00') # offset plus pad - - # generate secondary INDX1 - indx1 = StringIO() - indx1.write('INDX' + pack('>I', 0xc0) + '\0'*4) # header + 4x00 - indx1.write(pack('>I', 1)) # blockType 1 - indx1.write(pack('>I', 0x00)) # unknown - indx1.write('\x00\x00\x00\xF0') # IDXT offset - indx1.write(pack('>I', 4)) # num of IDXT entries - indx1.write('\xff'*8) # encoding, language - indx1.write('\0'*(0xc0-indx1.tell())) # 00 to IDXT Entries @ 0xC0 - indx1.write('\0\x01\x80') # 1 - null - indx1.write('\x06'+'author' + '\x02\x80\x80\xc7') # author - indx1.write('\x0B'+'description' + '\x02\x80\x80\xc6') # description - indx1.write('\x0D'+'mastheadImage' + '\x02\x85\x80\xc5') # mastheadImage - indx1.write('IDXT'+'\x00\xc0\x00\xc3\x00\xce\x00\xde') # IDXT header - - # Write INDX0 and INDX1 to the stream - indx0, indx1 = indx0.getvalue(), indx1.getvalue() - self._records.extend((indx0, indx1)) - if self.opts.verbose > 3: - from tempfile import mkdtemp - import os - t = mkdtemp() - for i, n in enumerate(['sindx1', 'sindx0', 'ctoc', 'indx0', 'indx1']): - open(os.path.join(t, n+'.bin'), 'wb').write(self._records[-(i+1)]) - self._oeb.log.debug('Index records dumped to', t) - - # Index nodes {{{ - def _write_periodical_node(self, indxt, indices, index, offset, length, count, firstSection, lastSection) : - pos = 0xc0 + indxt.tell() - indices.write(pack('>H', pos)) # Save the offset for IDXTIndices - name = "%04X"%count - indxt.write(chr(len(name)) + name) # Write the name - indxt.write(INDXT['periodical']) # entryType [0x0F | 0xDF | 0xFF | 0x3F] - indxt.write(chr(1)) # subType 1 - indxt.write(decint(offset, DECINT_FORWARD)) # offset - indxt.write(decint(length, DECINT_FORWARD)) # length - indxt.write(decint(self._ctoc_map[index]['titleOffset'], DECINT_FORWARD)) # vwi title offset in CNCX - - indxt.write(decint(0, DECINT_FORWARD)) # 
unknown byte - - indxt.write(decint(self._ctoc_map[index]['classOffset'], DECINT_FORWARD)) # vwi title offset in CNCX - indxt.write(decint(firstSection, DECINT_FORWARD)) # first section in periodical - indxt.write(decint(lastSection, DECINT_FORWARD)) # first section in periodical - - indxt.write(decint(0, DECINT_FORWARD)) # 0x80 - - def _write_section_node(self, indxt, indices, myCtocMapIndex, index, offset, length, count, firstArticle, lastArticle, parentIndex) : - pos = 0xc0 + indxt.tell() - indices.write(pack('>H', pos)) # Save the offset for IDXTIndices - name = "%04X"%count - indxt.write(chr(len(name)) + name) # Write the name - indxt.write(INDXT['section']) # entryType [0x0F | 0xDF | 0xFF | 0x3F] - indxt.write(chr(0)) # subType 0 - indxt.write(decint(offset, DECINT_FORWARD)) # offset - indxt.write(decint(length, DECINT_FORWARD)) # length - indxt.write(decint(self._ctoc_map[myCtocMapIndex]['titleOffset'], DECINT_FORWARD)) # vwi title offset in CNCX - - indxt.write(decint(1, DECINT_FORWARD)) # unknown byte - - indxt.write(decint(self._ctoc_map[myCtocMapIndex]['classOffset'], DECINT_FORWARD)) # vwi title offset in CNCX - indxt.write(decint(parentIndex, DECINT_FORWARD)) # index of periodicalParent - indxt.write(decint(firstArticle, DECINT_FORWARD)) # first section in periodical - indxt.write(decint(lastArticle, DECINT_FORWARD)) # first section in periodical - - def _write_article_node(self, indxt, indices, index, offset, length, count, parentIndex) : - pos = 0xc0 + indxt.tell() - indices.write(pack('>H', pos)) # Save the offset for IDXTIndices - name = "%04X"%count - indxt.write(chr(len(name)) + name) # Write the name - indxt.write(INDXT['article']) # entryType [0x0F | 0xDF | 0xFF | 0x3F] - - hasAuthor = True if self._ctoc_map[index]['authorOffset'] else False - hasDescription = True if self._ctoc_map[index]['descriptionOffset'] else False - - # flagBits may be dependent upon the generatorID written at 0x10 in generate_index(). - # in INDX0. 
Mobigen uses a generatorID of 2 and writes these bits at positions 1 & 2; - # calibre uses a generatorID of 6 and writes the bits at positions 2 & 3. - flagBits = 0 - if hasAuthor : flagBits |= 0x4 - if hasDescription : flagBits |= 0x2 - indxt.write(pack('>B',flagBits)) # Author/description flags - indxt.write(decint(offset, DECINT_FORWARD)) # offset - - - indxt.write(decint(length, DECINT_FORWARD)) # length - indxt.write(decint(self._ctoc_map[index]['titleOffset'], DECINT_FORWARD)) # vwi title offset in CNCX - - indxt.write(decint(2, DECINT_FORWARD)) # unknown byte - - indxt.write(decint(self._ctoc_map[index]['classOffset'], DECINT_FORWARD)) # vwi title offset in CNCX - indxt.write(decint(parentIndex, DECINT_FORWARD)) # index of periodicalParent - - # Optionally write the author and description fields - descriptionOffset = self._ctoc_map[index]['descriptionOffset'] - if descriptionOffset : - indxt.write(decint(descriptionOffset, DECINT_FORWARD)) - - authorOffset = self._ctoc_map[index]['authorOffset'] - if authorOffset : - indxt.write(decint(authorOffset, DECINT_FORWARD)) - - def _write_chapter_node(self, indxt, indices, index, offset, length, count): - # Writes an INDX1 NCXEntry of entryType 0x0F - chapter - if self.opts.verbose > 2: - # *** GR: Turn this off while I'm developing my code - #self._oeb.log.debug('Writing TOC node to IDXT:', node.title, 'href:', node.href) - pass - - pos = 0xc0 + indxt.tell() - indices.write(pack('>H', pos)) # Save the offset for IDXTIndices - name = "%04X"%count - indxt.write(chr(len(name)) + name) # Write the name - indxt.write(INDXT['chapter']) # entryType [0x0F | 0xDF | 0xFF | 0x3F] - indxt.write(decint(offset, DECINT_FORWARD)) # offset - indxt.write(decint(length, DECINT_FORWARD)) # length - indxt.write(decint(self._ctoc_map[index]['titleOffset'], DECINT_FORWARD)) # vwi title offset in CNCX - indxt.write(decint(0, DECINT_FORWARD)) # unknown byte - - # }}} - - - def _generate_section_indices(self, child, currentSection, 
myPeriodical, myDoc ) : - sectionTitles = list(child.iter())[1:] - sectionIndices = [] - sectionParents = [] - for (j, section) in enumerate(sectionTitles): - # iterate over just the sections - - if section.klass == 'periodical' : - # Write our index to the list - sectionIndices.append(currentSection) - - if self.opts.verbose > 3 : - self._oeb.logger.info("Periodical: %15.15s \tkls:%s \tdpt:%d ply:%03d" % \ - (section.title, section.klass, section.depth(), section.play_order) ) - - elif section.klass == 'section' : - # Add sections, save in list with original sequence number - myNewSection = myPeriodical.addSectionParent(myDoc, j) - sectionParents.append(myNewSection) - - # Bump the section # - currentSection += 1 - # Write our index to the list - sectionIndices.append(currentSection) - - if self.opts.verbose > 3 : - self._oeb.logger.info(" Section: %15.15s \tkls:%s \tdpt:%d ply:%03d \tindex:%d" % \ - (section.title, section.klass, section.depth(), section.play_order,j) ) - - elif section.klass == 'article' : - # Write our index to the list - sectionIndices.append(currentSection) - - else : - if self.opts.verbose > 3 : - self._oeb.logger.info( " Unrecognized class %s in structured document" % section.klass) - return sectionIndices, sectionParents - - def _generate_section_article_indices(self, i, section, entries, sectionIndices, sectionParents): - sectionArticles = list(section.iter())[1:] - # Iterate over the section's articles - - for (j, article) in enumerate(sectionArticles): - # Recompute offset and length for each article - offset, length = self._compute_offset_length(i, article, entries) - if self.opts.verbose > 2 : - self._oeb.logger.info( "article %02d: offset = 0x%06X length = 0x%06X" % (j, offset, length) ) - - ctoc_map_index = i + j + 1 - - #hasAuthor = self._ctoc_map[ctoc_map_index].get('authorOffset') - #hasDescription = self._ctoc_map[ctoc_map_index].get('descriptionOffset') - mySectionParent = sectionParents[sectionIndices[i-1]] - myNewArticle = 
MobiArticle(mySectionParent, offset, length, ctoc_map_index ) - mySectionParent.addArticle( myNewArticle ) - - def _add_book_chapters(self, myDoc, indxt, indices): - chapterCount = myDoc.documentStructure.chapterCount() - if self.opts.verbose > 3 : - self._oeb.logger.info("Writing %d chapters for mobitype 0x%03X" % (chapterCount, myDoc.mobiType)) - - for (c, chapter) in enumerate(list(myDoc.documentStructure.chapters)) : - index = chapter.myCtocMapIndex - self._write_chapter_node(indxt, indices, index, chapter.startAddress, chapter.length, c) - - last_name = "%04X"%c # Returned when done - return last_name, c - - def _add_periodical_flat_articles(self, myDoc, indxt, indices): - sectionParent = myDoc.documentStructure.sectionParents[0] - articleCount = len(sectionParent.articles) - if self.opts.verbose > 3 : - self._oeb.logger.info("Writing %d articles for mobitype 0x%03X" % (articleCount, myDoc.mobiType)) - - # Singleton periodical - index = 0 - offset = myDoc.documentStructure.startAddress - length = myDoc.documentStructure.length - c = 0 - firstSection = myDoc.documentStructure.firstSectionIndex - lastSection = myDoc.documentStructure.lastSectionIndex - self._write_periodical_node(indxt, indices, index, offset, length, c, firstSection, lastSection) - - # Singleton section - index += 1 - offset = sectionParent.startAddress - length = sectionParent.sectionLength - c += 1 - firstArticle = sectionParent.firstArticleIndex - lastArticle = sectionParent.lastArticleIndex - parentIndex = sectionParent.parentIndex - self._write_section_node(indxt, indices, sectionParent.myCtocMapIndex, index, offset, length, c, firstArticle, lastArticle, parentIndex) - - # articles - for article in list(sectionParent.articles): - index = article.myCtocMapIndex - offset = article.startAddress - length = article.articleLength - c += 1 - parentIndex = article.sectionParentIndex - self._write_article_node(indxt, indices, index, offset, length, c, parentIndex) - - last_name = "%04X" % c - 
return last_name, c - - def _add_periodical_structured_articles(self, myDoc, indxt, indices): - # Write NCXEntries for Structured Periodical - # <periodical> - # <section> - # <section> ... - # <article> - # <article> ... - - if self.opts.verbose > 2 : - self._oeb.logger.info( "Writing NCXEntries for mobiType 0x%03X" % myDoc.mobiType) - - sectionParent = myDoc.documentStructure.sectionParents[0] - #articleCount = len(sectionParent.articles) - - # Write opening periodical 0xDF entry - index = 0 - offset = myDoc.documentStructure.startAddress - length = myDoc.documentStructure.length - c = 0 - firstSection = myDoc.documentStructure.firstSectionIndex - lastSection = myDoc.documentStructure.lastSectionIndex - self._write_periodical_node(indxt, indices, index, offset, length, c, firstSection, lastSection) - - # Write each section 0xFF entry - sectionCount = firstSection - while sectionCount <= lastSection : - # section - sectionParent = myDoc.documentStructure.sectionParents[sectionCount - 1] - #articleCount = len(sectionParent.articles) - #index += 1 - offset = sectionParent.startAddress - length = sectionParent.sectionLength - c += 1 - firstArticle = sectionParent.firstArticleIndex - lastArticle = sectionParent.lastArticleIndex - parentIndex = sectionParent.parentIndex - self._write_section_node(indxt, indices, sectionParent.myCtocMapIndex, sectionCount, offset, length, c, firstArticle, lastArticle, parentIndex) - sectionCount += 1 - - # Write each article 0x3F entry - sectionCount = firstSection - while sectionCount <= lastSection : - # section - sectionParent = myDoc.documentStructure.sectionParents[sectionCount - 1] -# articleCount = len(sectionParent.articles) -# index += 1 -# offset = sectionParent.startAddress -# length = sectionParent.sectionLength -# c += 1 -# firstArticle = sectionParent.firstArticleIndex -# lastArticle = sectionParent.lastArticleIndex -# parentIndex = sectionParent.parentIndex -# add_section_node(index, offset, length, c, firstArticle, 
lastArticle, parentIndex) - - last_name = "%04X"%c - - # articles - for (i, article) in enumerate(list(sectionParent.articles)) : - if self.opts.verbose > 3 : - self._oeb.logger.info( "Adding section:article %d:%02d" % \ - (sectionParent.myIndex, i)) - index = article.myCtocMapIndex - offset = article.startAddress - length = article.articleLength - c += 1 - parentIndex = article.sectionParentIndex - self._write_article_node(indxt, indices, index, offset, length, c, parentIndex) - - last_name = "%04X"%c - - sectionCount += 1 - - return last_name, c - - def _generate_indxt(self): - # Assumption: child.depth() represents nestedness of the TOC. - # A flat document (book) has a depth of 2: - # <navMap> child.depth() = 2 - # <navPoint> Chapter child.depth() = 1 - # <navPoint> Chapter etc - # -or- - # A structured document (periodical) has a depth of 4 (Mobigen-prepped) - # <navMap> child.depth() = 4 - # <navPoint> Periodical child.depth() = 3 - # <navPoint> Section 1 child.depth() = 2 - # <navPoint> Article child.depth() = 1 - # <navPoint> Article(s) child.depth() = 1 - # <navpoint> Section 2 - - sectionIndices = [] - sectionParents = [] - currentSection = 0 # Starting section number - toc = self._oeb.toc - indxt, indices, c = StringIO(), StringIO(), 0 - - indices.write('IDXT') - last_name = None - - # 'book', 'periodical' or None - documentType = self._establish_document_structure() - myDoc = self._MobiDoc - - nodes = list(toc.iter())[0:1] - for (i, child) in enumerate(nodes) : - - if documentType == "periodical" : - myPeriodical = myDoc.documentStructure - if self.opts.verbose > 3 : - self._oeb.logger.info("\nDocument: %s \tkls:%s \tdpt:%d ply:%03d" % \ - (child.title, child.klass, child.depth(), child.play_order) ) - sectionIndices, sectionParents = \ - self._generate_section_indices(child, currentSection, myPeriodical, myDoc) - - elif documentType == "book" : - myBook = myDoc.documentStructure - - if self.opts.verbose > 3 : - self._oeb.logger.info("\nBook: %-19.19s 
\tkls:%s \tdpt:%d ply:%03d" % \ - (child.title, child.klass, child.depth(), child.play_order) ) - else : - if self.opts.verbose > 3 : - self._oeb.logger.info("unknown document type %12.12s \tdepth:%d" % (child.title, child.depth()) ) - - # Original code starts here - # test first node for depth/class - entries = list(toc.iter())[1:] - for (i, child) in enumerate(entries): - if not child.title or not child.title.strip(): - continue - - offset, length = self._compute_offset_length(i, child, entries) - - if child.klass == 'chapter' or \ - (not self.opts.mobi_periodical and child.klass == 'article') : - # create chapter object - confirm i + 0 is correct!! - myNewChapter = MobiChapter(myDoc.getNextNode(), offset, length, i) - myBook.addChapter(myNewChapter) - - # Diagnostic - try : - if self.opts.verbose > 3 : - self._oeb.logger.info( " Chapter: %-14.14s \tcls:%s \tdpt:%d ply:%03d \toff:0x%X \t:len0x%X" % \ - (child.title, child.klass, child.depth(), child.play_order, offset, length) ) - except : - if self.opts.verbose > 3 : - self._oeb.logger.info( " Chapter: %-14.14s \tclass:%s \tdepth:%d playOrder:%03d \toff:0x%X \t:len0x%X" % \ - ("(bad string)", child.klass, child.depth(), child.play_order, offset, length)) - - elif child.klass == 'section' and self.opts.mobi_periodical : - if self.opts.verbose > 3 : - self._oeb.logger.info("\n Section: %-15.15s \tkls:%s \tdpt:%d ply:%03d" % \ - (child.title, child.klass, child.depth(), child.play_order)) - self._generate_section_article_indices(i, child, entries, sectionIndices, sectionParents) - - if self.opts.verbose > 3 : - self._oeb.logger.info("") - - mobiType = myDoc.mobiType - if self.opts.verbose > 3 : - self._MobiDoc.dumpInfo() - - if mobiType == 0x02 : - last_name, c = self._add_book_chapters(myDoc, indxt, indices) - - elif mobiType == 0x102 and myDoc.documentStructure.sectionCount() == 1 : - last_name, c = self._add_periodical_flat_articles(myDoc, indxt, indices) - - else : - last_name, c = 
self._add_periodical_structured_articles(myDoc, indxt, indices) - - return align_block(indxt.getvalue()), c, align_block(indices.getvalue()), last_name - # }}} - - # CTOC {{{ - def _add_to_ctoc(self, ctoc_str, record_offset): - # Write vwilen + string to ctoc - # Return offset - # Is there enough room for this string in the current ctoc record? - if 0xfbf8 - self._ctoc.tell() < 2 + len(ctoc_str): - # flush this ctoc, start a new one - # print "closing ctoc_record at 0x%X" % self._ctoc.tell() - # print "starting new ctoc with '%-50.50s ...'" % ctoc_str - # pad with 00 - pad = 0xfbf8 - self._ctoc.tell() - # print "padding %d bytes of 00" % pad - self._ctoc.write('\0' * (pad)) - self._ctoc_records.append(self._ctoc.getvalue()) - self._ctoc.truncate(0) - self._ctoc_offset += 0x10000 - record_offset = self._ctoc_offset - - offset = self._ctoc.tell() + record_offset - self._ctoc.write(decint(len(ctoc_str), DECINT_FORWARD) + ctoc_str) - return offset - - def _add_flat_ctoc_node(self, node, ctoc, title=None): - # Process 'chapter' or 'article' nodes only, force either to 'chapter' - t = node.title if title is None else title - t = self._clean_text_value(t) - self._last_toc_entry = t - - # Create an empty dictionary for this node - ctoc_name_map = {} - - # article = chapter - if node.klass == 'article' : - ctoc_name_map['klass'] = 'chapter' - else : - ctoc_name_map['klass'] = node.klass - - # Add title offset to name map - ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) - self._chapterCount += 1 - - # append this node's name_map to map - self._ctoc_map.append(ctoc_name_map) - - return - - def _add_structured_ctoc_node(self, node, ctoc, title=None): - # Process 'periodical', 'section' and 'article' - - # Fetch the offset referencing the current ctoc_record - if node.klass is None : - return - t = node.title if title is None else title - t = self._clean_text_value(t) - self._last_toc_entry = t - - # Create an empty dictionary for this node - 
ctoc_name_map = {} - - # Add the klass of this node - ctoc_name_map['klass'] = node.klass - - if node.klass == 'chapter': - # Add title offset to name map - ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) - self._chapterCount += 1 - - elif node.klass == 'periodical' : - # Add title offset - ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) - - # Look for existing class entry 'periodical' in _ctoc_map - for entry in self._ctoc_map: - if entry['klass'] == 'periodical': - # Use the pre-existing instance - ctoc_name_map['classOffset'] = entry['classOffset'] - break - else : - continue - else: - # class names should always be in CNCX 0 - no offset - ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0) - - self._periodicalCount += 1 - - elif node.klass == 'section' : - # Add title offset - ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) - - # Look for existing class entry 'section' in _ctoc_map - for entry in self._ctoc_map: - if entry['klass'] == 'section': - # Use the pre-existing instance - ctoc_name_map['classOffset'] = entry['classOffset'] - break - else : - continue - else: - # class names should always be in CNCX 0 - no offset - ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0) - - self._sectionCount += 1 - - elif node.klass == 'article' : - # Add title offset/title - ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset) - - # Look for existing class entry 'article' in _ctoc_map - for entry in self._ctoc_map: - if entry['klass'] == 'article': - ctoc_name_map['classOffset'] = entry['classOffset'] - break - else : - continue - else: - # class names should always be in CNCX 0 - no offset - ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0) - - # Add description offset/description - if node.description : - d = self._clean_text_value(node.description) - ctoc_name_map['descriptionOffset'] = self._add_to_ctoc(d, self._ctoc_offset) - else : - 
ctoc_name_map['descriptionOffset'] = None - - # Add author offset/attribution - if node.author : - a = self._clean_text_value(node.author) - ctoc_name_map['authorOffset'] = self._add_to_ctoc(a, self._ctoc_offset) - else : - ctoc_name_map['authorOffset'] = None - - self._articleCount += 1 - - else : - raise NotImplementedError( \ - 'writer._generate_ctoc.add_node: title: %s has unrecognized klass: %s, playOrder: %d' % \ - (node.title, node.klass, node.play_order)) - - # append this node's name_map to map - self._ctoc_map.append(ctoc_name_map) - - def _generate_ctoc(self): - # Generate the compiled TOC strings - # Each node has 1-4 CTOC entries: - # Periodical (0xDF) - # title, class - # Section (0xFF) - # title, class - # Article (0x3F) - # title, class, description, author - # Chapter (0x0F) - # title, class - # nb: Chapters don't actually have @class, so we synthesize it - # in reader._toc_from_navpoint - - toc = self._oeb.toc - reduced_toc = [] - self._ctoc_map = [] # per node dictionary of {class/title/desc/author} offsets - self._last_toc_entry = None - #ctoc = StringIO() - self._ctoc = StringIO() - - # Track the individual node types - self._periodicalCount = 0 - self._sectionCount = 0 - self._articleCount = 0 - self._chapterCount = 0 - - #first = True - - if self._conforming_periodical_toc : - self._oeb.logger.info('Generating structured CTOC ...') - for (child) in toc.iter(): - if self.opts.verbose > 2 : - self._oeb.logger.info(" %s" % child) - self._add_structured_ctoc_node(child, self._ctoc) - #first = False - - else : - self._oeb.logger.info('Generating flat CTOC ...') - previousOffset = -1 - currentOffset = 0 - for (i, child) in enumerate(toc.iterdescendants()): - # Only add chapters or articles at depth==1 - # no class defaults to 'chapter' - if child.klass is None : child.klass = 'chapter' - if (child.klass == 'article' or child.klass == 'chapter') and child.depth() == 1 : - if self.opts.verbose > 2 : - self._oeb.logger.info("adding (klass:%s depth:%d) 
%s to flat ctoc" % \ - (child.klass, child.depth(), child) ) - - # Test to see if this child's offset is the same as the previous child's - # offset, skip it - h = child.href - - if h is None: - self._oeb.logger.warn(' Ignoring TOC entry with no href:', - child.title) - continue - if h not in self._id_offsets: - self._oeb.logger.warn(' Ignoring missing TOC entry:', - unicode(child)) - continue - - currentOffset = self._id_offsets[h] - # print "_generate_ctoc: child offset: 0x%X" % currentOffset - - if currentOffset != previousOffset : - self._add_flat_ctoc_node(child, self._ctoc) - reduced_toc.append(child) - previousOffset = currentOffset - else : - self._oeb.logger.warn(" Ignoring redundant href: %s in '%s'" % (h, child.title)) - - else : - if self.opts.verbose > 2 : - self._oeb.logger.info("skipping class: %s depth %d at position %d" % \ - (child.klass, child.depth(),i)) - - # Update the TOC with our edited version - self._oeb.toc.nodes = reduced_toc - - # Instantiate a MobiDocument(mobitype) - if (not self._periodicalCount and not self._sectionCount and not self._articleCount) or \ - not self.opts.mobi_periodical : - mobiType = 0x002 - elif self._periodicalCount: - pt = None - if self._oeb.metadata.publication_type: - x = unicode(self._oeb.metadata.publication_type[0]).split(':') - if len(x) > 1: - pt = x[1] - mobiType = {'newspaper':0x101}.get(pt, 0x103) - else : - raise NotImplementedError('_generate_ctoc: Unrecognized document structured') - - self._MobiDoc = MobiDocument(mobiType) - - if self.opts.verbose > 2 : - structType = 'book' - if mobiType > 0x100 : - structType = 'flat periodical' if mobiType == 0x102 else 'structured periodical' - self._oeb.logger.info("Instantiating a %s MobiDocument of type 0x%X" % (structType, mobiType ) ) - if mobiType > 0x100 : - self._oeb.logger.info("periodicalCount: %d sectionCount: %d articleCount: %d"% \ - (self._periodicalCount, self._sectionCount, self._articleCount) ) - else : - self._oeb.logger.info("chapterCount: %d" 
% self._chapterCount) - - # Apparently the CTOC must end with a null byte - self._ctoc.write('\0') - - ctoc = self._ctoc.getvalue() - rec_count = len(self._ctoc_records) - self._oeb.logger.info(" CNCX utilization: %d %s %.0f%% full" % \ - (rec_count + 1, 'records, last record' if rec_count else 'record,', - len(ctoc)/655) ) - - return align_block(ctoc) - - # }}} - -class HTMLRecordData(object): - """ A data structure containing indexing/navigation data for an HTML record """ - def __init__(self): - self._continuingNode = -1 - self._continuingNodeParent = -1 - self._openingNode = -1 - self._openingNodeParent = -1 - self._currentSectionNodeCount = -1 - self._nextSectionNumber = -1 - self._nextSectionOpeningNode = -1 - self._nextSectionNodeCount = -1 - - def getContinuingNode(self): - return self._continuingNode - def setContinuingNode(self, value): - self._continuingNode = value - continuingNode = property(getContinuingNode, setContinuingNode, None, None) - - def getContinuingNodeParent(self): - return self._continuingNodeParent - def setContinuingNodeParent(self, value): - self._continuingNodeParent = value - continuingNodeParent = property(getContinuingNodeParent, setContinuingNodeParent, None, None) - - def getOpeningNode(self): - return self._openingNode - def setOpeningNode(self, value): - self._openingNode = value - openingNode = property(getOpeningNode, setOpeningNode, None, None) - - def getOpeningNodeParent(self): - return self._openingNodeParent - def setOpeningNodeParent(self, value): - self._openingNodeParent = value - openingNodeParent = property(getOpeningNodeParent, setOpeningNodeParent, None, None) - - def getCurrentSectionNodeCount(self): - return self._currentSectionNodeCount - def setCurrentSectionNodeCount(self, value): - self._currentSectionNodeCount = value - currentSectionNodeCount = property(getCurrentSectionNodeCount, setCurrentSectionNodeCount, None, None) - - def getNextSectionNumber(self): - return self._nextSectionNumber - def 
setNextSectionNumber(self, value): - self._nextSectionNumber = value - nextSectionNumber = property(getNextSectionNumber, setNextSectionNumber, None, None) - - def getNextSectionOpeningNode(self): - return self._nextSectionOpeningNode - def setNextSectionOpeningNode(self, value): - self._nextSectionOpeningNode = value - nextSectionOpeningNode = property(getNextSectionOpeningNode, setNextSectionOpeningNode, None, None) - - def getNextSectionNodeCount(self): - return self._nextSectionNodeCount - def setNextSectionNodeCount(self, value): - self._nextSectionNodeCount = value - nextSectionNodeCount = property(getNextSectionNodeCount, setNextSectionNodeCount, None, None) - - def dumpData(self, recordNumber, oeb): - oeb.logger.info( "--- Summary of HTML Record 0x%x [%d] indexing ---" % (recordNumber, recordNumber) ) - oeb.logger.info( " continuingNode: %03d" % self.continuingNode ) - oeb.logger.info( " continuingNodeParent: %03d" % self.continuingNodeParent ) - oeb.logger.info( " openingNode: %03d" % self.openingNode ) - oeb.logger.info( " openingNodeParent: %03d" % self.openingNodeParent ) - oeb.logger.info( " currentSectionNodeCount: %03d" % self.currentSectionNodeCount ) - oeb.logger.info( " nextSectionNumber: %03d" % self.nextSectionNumber ) - oeb.logger.info( " nextSectionOpeningNode: %03d" % self.nextSectionOpeningNode ) - oeb.logger.info( " nextSectionNodeCount: %03d" % self.nextSectionNodeCount ) - -class MobiDocument(object): - """ Hierarchical description of a Mobi document """ - - # Counter to assign index values as new nodes are created - _nextNode = -1 - - def __init__(self, mobitype): - self._mobitype = mobitype - self._documentStructure = None # Assigned in _generate_indxt - - def getMobiType(self): - return self._mobitype - def setMobiType(self, value): - self._mobitype = value - mobiType = property(getMobiType, setMobiType, None, None) - - def getDocumentStructure(self): - return self._documentStructure - def setDocumentStructure(self, value): - 
self._documentStructure = value - documentStructure = property(getDocumentStructure, setDocumentStructure, None, None) - - def getNextNode(self): - self._nextNode += 1 - return self._nextNode - - def dumpInfo(self): - self._documentStructure.dumpInfo() - -class MobiBook(object): - """ A container for a flat chapter-to-chapter Mobi book """ - def __init__(self): - self._chapters = [] - - def chapterCount(self): - return len(self._chapters) - - def getChapters(self): - return self._chapters - def setChapters(self, value): - self._chapters = value - chapters = property(getChapters, setChapters, None, None) - - def addChapter(self, value): - self._chapters.append(value) - - def dumpInfo(self): - print "%20s:" % ("Book") - print "%20s: %d" % ("Number of chapters", len(self._chapters)) - for (count, chapter) in enumerate(self._chapters): - print "%20s: %d" % ("myCtocMapIndex",chapter.myCtocMapIndex) - print "%20s: %d" % ("Chapter",count) - print "%20s: 0x%X" % ("startAddress", chapter.startAddress) - print "%20s: 0x%X" % ("length", chapter.length) - print - -class MobiChapter(object): - """ A container for Mobi chapters """ - def __init__(self, myIndex, startAddress, length, ctoc_map_index): - self._myIndex = myIndex - self._startAddress = startAddress - self._length = length - self._myCtocMapIndex = ctoc_map_index - - def getMyCtocMapIndex(self): - return self._myCtocMapIndex - def setMyCtocMapIndex(self, value): - self._myCtocMapIndex = value - myCtocMapIndex = property(getMyCtocMapIndex, setMyCtocMapIndex, None, None) - - def getMyIndex(self): - return self._myIndex - myIndex = property(getMyIndex, None, None, None) - - def getStartAddress(self): - return self._startAddress - def setStartAddress(self, value): - self._startAddress = value - startAddress = property(getStartAddress, setStartAddress, None, None) - - def getLength(self): - return self._length - def setLength(self, value): - self._length = value - length = property(getLength, setLength, None, None) - -class 
MobiPeriodical(object): - """ A container for a structured periodical """ - def __init__(self, myIndex): - self._myIndex = myIndex - self._sectionParents = [] - self._startAddress = 0xFFFFFFFF - self._length = 0xFFFFFFFF - self._firstSectionIndex = 0xFFFFFFFF - self._lastSectionIndex = 0xFFFFFFFF - self._myCtocMapIndex = 0 # Always first entry - - def getMyIndex(self): - return self._myIndex - def setMyIndex(self, value): - self._myIndex = value - myIndex = property(getMyIndex, setMyIndex, None, None) - - def getSectionParents(self): - return self._sectionParents - def setSectionParents(self, value): - self._sectionParents = value - sectionParents = property(getSectionParents, setSectionParents, None, None) - - def sectionCount(self): - return len(self._sectionParents) - - def getStartAddress(self): - return self._startAddress - def setStartAddress(self, value): - self._startAddress = value - startAddress = property(getStartAddress, setStartAddress, None, None) - - def getLength(self): - return self._length - def setLength(self, value): - self._length = value - length = property(getLength, setLength, None, None) - - def getFirstSectionIndex(self): - return self._firstSectionIndex - def setFirstSectionIndex(self, value): - self._firstSectionIndex = value - firstSectionIndex = property(getFirstSectionIndex, setFirstSectionIndex, None, None) - - def getLastSectionIndex(self): - return self._lastSectionIndex - def setLastSectionIndex(self, value): - self._lastSectionIndex = value - lastSectionIndex = property(getLastSectionIndex, setLastSectionIndex, None, None) - - def getMyCtocMapIndex(self): - return self._myCtocMapIndex - def setMyCtocMapIndex(self, value): - self._myCtocMapIndex = value - myCtocMapIndex = property(getMyCtocMapIndex, setMyCtocMapIndex, None, None) - - def addSectionParent(self, myIndex, ctoc_map_index): - # Create a new section parent - newSection = MobiSection(myIndex) - # Assign our index to the section - newSection.parentIndex = self._myIndex - 
# Assign section number - newSection.sectionIndex = len(self._sectionParents) - # Assign ctoc_map_index - newSection.myCtocMapIndex = ctoc_map_index - # Add it to the list - self._sectionParents.append(newSection) - return newSection - - def dumpInfo(self): - print "%20s:" % ("Periodical") - print "%20s: 0x%X" % ("myIndex", self.myIndex) - print "%20s: 0x%X" % ("startAddress", self.startAddress) - print "%20s: 0x%X" % ("length", self.length) - print "%20s: 0x%X" % ("myCtocMapIndex", self.myCtocMapIndex) - print "%20s: 0x%X" % ("firstSectionIndex", self.firstSectionIndex) - print "%20s: 0x%X" % ("lastSectionIndex", self.lastSectionIndex) - print "%20s: %d" % ("Number of Sections", len(self._sectionParents)) - for (count, section) in enumerate(self._sectionParents): - print "\t%20s: %d" % ("Section",count) - print "\t%20s: 0x%X" % ("startAddress", section.startAddress) - print "\t%20s: 0x%X" % ("length", section.sectionLength) - print "\t%20s: 0x%X" % ("parentIndex", section.parentIndex) - print "\t%20s: 0x%X" % ("myIndex", section.myIndex) - print "\t%20s: 0x%X" % ("firstArticleIndex", section.firstArticleIndex) - print "\t%20s: 0x%X" % ("lastArticleIndex", section.lastArticleIndex) - print "\t%20s: 0x%X" % ("articles", len(section.articles) ) - print "\t%20s: 0x%X" % ("myCtocMapIndex", section.myCtocMapIndex ) - print - for (artCount, article) in enumerate(section.articles) : - print "\t\t%20s: %d" % ("Article",artCount) - print "\t\t%20s: 0x%X" % ("startAddress", article.startAddress) - print "\t\t%20s: 0x%X" % ("length", article.articleLength) - print "\t\t%20s: 0x%X" % ("sectionIndex", article.sectionParentIndex) - print "\t\t%20s: 0x%X" % ("myIndex", article.myIndex) - print "\t\t%20s: 0x%X" % ("myCtocMapIndex", article.myCtocMapIndex) - print - -class MobiSection(object): - """ A container for periodical sections """ - def __init__(self, myMobiDoc): - self._myMobiDoc = myMobiDoc - self._myIndex = myMobiDoc.getNextNode() - self._parentIndex = 0xFFFFFFFF - 
self._firstArticleIndex = 0x00 - self._lastArticleIndex = 0x00 - self._startAddress = 0xFFFFFFFF - self._sectionLength = 0xFFFFFFFF - self._articles = [] - self._myCtocMapIndex = -1 - - def getMyMobiDoc(self): - return self._myMobiDoc - def setMyMobiDoc(self, value): - self._myMobiDoc = value - myMobiDoc = property(getMyMobiDoc, setMyMobiDoc, None, None) - - def getMyIndex(self): - return self._myIndex - def setMyIndex(self, value): - self._myIndex = value - myIndex = property(getMyIndex, setMyIndex, None, None) - - def getParentIndex(self): - return self._parentIndex - def setParentIndex(self, value): - self._parentIndex = value - parenIndex = property(getParentIndex, setParentIndex, None, None) - - def getFirstArticleIndex(self): - return self._firstArticleIndex - def setFirstArticleIndex(self, value): - self._firstArticleIndex = value - firstArticleIndex = property(getFirstArticleIndex, setFirstArticleIndex, None, None) - - def getLastArticleIndex(self): - return self._lastArticleIndex - def setLastArticleIndex(self, value): - self._lastArticleIndex = value - lastArticleIndex = property(getLastArticleIndex, setLastArticleIndex, None, None) - - def getStartAddress(self): - return self._startAddress - def setStartAddress(self, value): - self._startAddress = value - startAddress = property(getStartAddress, setStartAddress, None, None) - - def getSectionLength(self): - return self._sectionLength - def setSectionLength(self, value): - self._sectionLength = value - sectionLength = property(getSectionLength, setSectionLength, None, None) - - def getArticles(self): - return self._articles - def setArticles(self, value): - self._articles = value - articles = property(getArticles, setArticles, None, None) - - def getMyCtocMapIndex(self): - return self._myCtocMapIndex - def setMyCtocMapIndex(self, value): - self._myCtocMapIndex = value - myCtocMapIndex = property(getMyCtocMapIndex, setMyCtocMapIndex, None, None) - - def addArticle(self, article): - 
self._articles.append(article) - - # Adjust the Periodical parameters - # If this is the first article of the first section, init the values - if self.myIndex == 1 and len(self.articles) == 1 : - self.myMobiDoc.documentStructure.firstSectionIndex = self.myIndex - self.myMobiDoc.documentStructure.lastSectionIndex = self.myIndex - self.myMobiDoc.documentStructure.length = article.articleLength + \ - ( article.startAddress - self.myMobiDoc.documentStructure.startAddress) - else: - self.myMobiDoc.documentStructure.length += article.articleLength - - # Always set the highest section index to myIndex - self.myMobiDoc.documentStructure.lastSectionIndex = self.myIndex - - # Adjust the Section parameters - if len(self.articles) == 1 : - self.firstArticleIndex = article.myIndex - - if len(self.myMobiDoc.documentStructure.sectionParents) == 1 : - self.startAddress = self.myMobiDoc.documentStructure.startAddress - self.sectionLength = article.articleLength + \ - ( article.startAddress - self.myMobiDoc.documentStructure.startAddress ) - - else : - self.startAddress = article.startAddress - self.sectionLength = article.articleLength - - self.lastArticleIndex = article.myIndex - else : - self.lastArticleIndex = article.myIndex - - # Adjust the Section length - if len(self.articles) > 1 : - self.sectionLength += article.articleLength - -class MobiArticle(object): - """ A container for periodical articles """ - def __init__(self, sectionParent, startAddress, length, ctocMapIndex): - self._mySectionParent = sectionParent - self._myMobiDoc = sectionParent.myMobiDoc - self._myIndex = sectionParent.myMobiDoc.getNextNode() - self._myCtocMapIndex = ctocMapIndex - self._sectionParentIndex = sectionParent.myIndex - self._startAddress = startAddress - self._articleLength = length - - def getMySectionParent(self): - return self._mySectionParent - def setMySectionParent(self, value): - self._mySectionParent = value - mySectionParent = property(getMySectionParent, setMySectionParent, None, 
None) - - def getMyMobiDoc(self): - return self._myMobiDoc - def setMyMobiDoc(self, value): - self._myMobiDoc = value - myMobiDoc = property(getMyMobiDoc, setMyMobiDoc, None, None) - - def getMyIndex(self): - return self._myIndex - def setMyIndex(self, value): - self._sectionIndex = value - myIndex = property(getMyIndex, setMyIndex, None, None) - - def getSectionParentIndex(self): - return self._sectionParentIndex - def setSectionParentIndex(self, value): - self._sectionParentIndex = value - sectionParentIndex = property(getSectionParentIndex, setSectionParentIndex, None, None) - - def getStartAddress(self): - return self._startAddress - def setStartAddress(self, value): - self._startAddress = value - startAddress = property(getStartAddress, setStartAddress, None, None) - - def getArticleLength(self): - return self._articleLength - def setArticleLength(self, value): - self._articleLength = value - articleLength = property(getArticleLength, setArticleLength, None, None) - - def getMyCtocMapIndex(self): - return self._myCtocMapIndex - def setMyCtocMapIndex(self, value): - self._myCtocMapIndex = value - myCtocMapIndex = property(getMyCtocMapIndex, setMyCtocMapIndex, None, None) - diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index 6f0c2b56e9..99321fab12 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -18,9 +18,10 @@ from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.utils.filenames import ascii_filename from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE) -from calibre.ebooks.mobi.utils import (rescale_image, encint, +from calibre.ebooks.mobi.utils import (rescale_image, encint, mobify_image, encode_trailing_data, align_block, detect_periodical) from calibre.ebooks.mobi.writer2.indexer import Indexer +from calibre.ebooks.mobi import MAX_THUMB_DIMEN, MAX_THUMB_SIZE 
EXTH_CODES = { 'creator': 100, @@ -46,9 +47,6 @@ EXTH_CODES = { # Disabled as I dont care about uncrossable breaks WRITE_UNCROSSABLE_BREAKS = False -MAX_THUMB_SIZE = 16 * 1024 -MAX_THUMB_DIMEN = (180, 240) - class MobiWriter(object): COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') @@ -181,7 +179,11 @@ class MobiWriter(object): for item in self.oeb.manifest.values(): if item.media_type not in OEB_RASTER_IMAGES: continue try: - data = rescale_image(item.data) + data = item.data + if self.opts.mobi_keep_original_images: + data = mobify_image(data) + else: + data = rescale_image(data) except: oeb.logger.warn('Bad image file %r' % item.href) continue diff --git a/src/calibre/ebooks/oeb/transforms/rescale.py b/src/calibre/ebooks/oeb/transforms/rescale.py index e984fad38a..7f07e242af 100644 --- a/src/calibre/ebooks/oeb/transforms/rescale.py +++ b/src/calibre/ebooks/oeb/transforms/rescale.py @@ -36,7 +36,9 @@ class RescaleImages(object): ext = 'JPEG' raw = item.data - if not raw: continue + if hasattr(raw, 'xpath') or not raw: + # Probably an svg image + continue try: img = Image() img.load(raw) diff --git a/src/calibre/gui2/convert/mobi_output.py b/src/calibre/gui2/convert/mobi_output.py index cd1d0430ae..50b67008d9 100644 --- a/src/calibre/gui2/convert/mobi_output.py +++ b/src/calibre/gui2/convert/mobi_output.py @@ -21,7 +21,8 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['prefer_author_sort', 'rescale_images', 'toc_title', + ['prefer_author_sort', 'toc_title', + 'mobi_keep_original_images', 'mobi_ignore_margins', 'mobi_toc_at_start', 'dont_compress', 'no_inline_toc', 'share_not_sync', 'personal_doc']#, 'mobi_navpoints_only_deepest'] diff --git a/src/calibre/gui2/convert/mobi_output.ui b/src/calibre/gui2/convert/mobi_output.ui index 68cd55ab95..2c62b8c27a 100644 --- a/src/calibre/gui2/convert/mobi_output.ui +++ b/src/calibre/gui2/convert/mobi_output.ui @@ -6,7 +6,7 @@ <rect> 
<x>0</x> <y>0</y> - <width>521</width> + <width>588</width> <height>342</height> </rect> </property> @@ -14,47 +14,6 @@ <string>Form</string> </property> <layout class="QGridLayout" name="gridLayout"> - <item row="1" column="0"> - <widget class="QLabel" name="label"> - <property name="text"> - <string>&Title for Table of Contents:</string> - </property> - <property name="buddy"> - <cstring>opt_toc_title</cstring> - </property> - </widget> - </item> - <item row="1" column="1"> - <widget class="QLineEdit" name="opt_toc_title"/> - </item> - <item row="4" column="0" colspan="2"> - <widget class="QCheckBox" name="opt_rescale_images"> - <property name="text"> - <string>Rescale images for &Palm devices</string> - </property> - </widget> - </item> - <item row="5" column="0" colspan="2"> - <widget class="QCheckBox" name="opt_prefer_author_sort"> - <property name="text"> - <string>Use author &sort for author</string> - </property> - </widget> - </item> - <item row="6" column="0"> - <widget class="QCheckBox" name="opt_dont_compress"> - <property name="text"> - <string>Disable compression of the file contents</string> - </property> - </widget> - </item> - <item row="0" column="0"> - <widget class="QCheckBox" name="opt_no_inline_toc"> - <property name="text"> - <string>Do not add Table of Contents to book</string> - </property> - </widget> - </item> <item row="8" column="0" colspan="2"> <widget class="QGroupBox" name="groupBox"> <property name="title"> @@ -125,6 +84,47 @@ </property> </widget> </item> + <item row="4" column="0" colspan="2"> + <widget class="QCheckBox" name="opt_prefer_author_sort"> + <property name="text"> + <string>Use author &sort for author</string> + </property> + </widget> + </item> + <item row="1" column="0"> + <widget class="QLabel" name="label"> + <property name="text"> + <string>&Title for Table of Contents:</string> + </property> + <property name="buddy"> + <cstring>opt_toc_title</cstring> + </property> + </widget> + </item> + <item row="1" 
column="1"> + <widget class="QLineEdit" name="opt_toc_title"/> + </item> + <item row="6" column="0"> + <widget class="QCheckBox" name="opt_dont_compress"> + <property name="text"> + <string>Disable compression of the file contents</string> + </property> + </widget> + </item> + <item row="0" column="0"> + <widget class="QCheckBox" name="opt_no_inline_toc"> + <property name="text"> + <string>Do not add Table of Contents to book</string> + </property> + </widget> + </item> + <item row="5" column="0" colspan="2"> + <widget class="QCheckBox" name="opt_mobi_keep_original_images"> + <property name="text"> + <string>Do not convert all images to &JPEG (may result in images not working in older viewers)</string> + </property> + </widget> + </item> </layout> </widget> <resources/> diff --git a/src/calibre/gui2/library/delegates.py b/src/calibre/gui2/library/delegates.py index eea3625a2a..81d25c1f5e 100644 --- a/src/calibre/gui2/library/delegates.py +++ b/src/calibre/gui2/library/delegates.py @@ -5,11 +5,14 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' +import sys + from PyQt4.Qt import (Qt, QApplication, QStyle, QIcon, QDoubleSpinBox, QVariant, QSpinBox, QStyledItemDelegate, QComboBox, QTextDocument, QAbstractTextDocumentLayout, QFont, QFontInfo) from calibre.gui2 import UNDEFINED_QDATETIME, error_dialog, rating_font +from calibre.constants import iswindows from calibre.gui2.widgets import EnLineEdit from calibre.gui2.complete import MultiCompleteLineEdit, MultiCompleteComboBox from calibre.utils.date import now, format_date, qt_to_dt @@ -27,7 +30,10 @@ class RatingDelegate(QStyledItemDelegate): # {{{ QStyledItemDelegate.__init__(self, *args, **kwargs) self.rf = QFont(rating_font()) self.em = Qt.ElideMiddle - self.rf.setPointSize(QFontInfo(QApplication.font()).pointSize()) + delta = 0 + if iswindows and sys.getwindowsversion().major >= 6: + delta = 2 + 
self.rf.setPointSize(QFontInfo(QApplication.font()).pointSize()+delta) def createEditor(self, parent, option, index): sb = QStyledItemDelegate.createEditor(self, parent, option, index) diff --git a/src/calibre/gui2/viewer/config.ui b/src/calibre/gui2/viewer/config.ui index fd43cd79ad..3158241f28 100644 --- a/src/calibre/gui2/viewer/config.ui +++ b/src/calibre/gui2/viewer/config.ui @@ -170,7 +170,7 @@ <item row="8" column="0" colspan="2"> <widget class="QCheckBox" name="opt_remember_window_size"> <property name="text"> - <string>Remember last used &window size</string> + <string>Remember last used &window size and layout</string> </property> </widget> </item> diff --git a/src/calibre/library/coloring.py b/src/calibre/library/coloring.py index e1955077b3..4847a48c7d 100644 --- a/src/calibre/library/coloring.py +++ b/src/calibre/library/coloring.py @@ -117,7 +117,10 @@ class Rule(object): # {{{ 'lt': ('1', '', ''), 'gt': ('', '', '1') }[action] - return "cmp(raw_field('%s'), %s, '%s', '%s', '%s')" % (col, val, lt, eq, gt) + if col == 'size': + return "cmp(booksize(), %s, '%s', '%s', '%s')" % (val, lt, eq, gt) + else: + return "cmp(raw_field('%s'), %s, '%s', '%s', '%s')" % (col, val, lt, eq, gt) def rating_condition(self, col, action, val): lt, eq, gt = { diff --git a/src/calibre/library/custom_columns.py b/src/calibre/library/custom_columns.py index 4c5ade37b0..453f03f38a 100644 --- a/src/calibre/library/custom_columns.py +++ b/src/calibre/library/custom_columns.py @@ -227,6 +227,25 @@ class CustomColumns(object): return self.conn.get('''SELECT extra FROM %s WHERE book=?'''%lt, (idx,), all=False) + def get_custom_and_extra(self, idx, label=None, num=None, index_is_id=False): + if label is not None: + data = self.custom_column_label_map[label] + if num is not None: + data = self.custom_column_num_map[num] + idx = idx if index_is_id else self.id(idx) + row = self.data._data[idx] + ans = row[self.FIELD_MAP[data['num']]] + if data['is_multiple'] and data['datatype'] == 
'text': + ans = ans.split(data['multiple_seps']['cache_to_list']) if ans else [] + if data['display'].get('sort_alpha', False): + ans.sort(cmp=lambda x,y:cmp(x.lower(), y.lower())) + if data['datatype'] != 'series': + return (ans, None) + ign,lt = self.custom_table_names(data['num']) + extra = self.conn.get('''SELECT extra FROM %s + WHERE book=?'''%lt, (idx,), all=False) + return (ans, extra) + # convenience methods for tag editing def get_custom_items_with_ids(self, label=None, num=None): if label is not None: diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 555d91b7df..bcc4b05399 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -910,7 +910,15 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): Convenience method to return metadata as a :class:`Metadata` object. Note that the list of formats is not verified. ''' - row = self.data._data[idx] if index_is_id else self.data[idx] + idx = idx if index_is_id else self.id(idx) + try: + row = self.data._data[idx] + except: + row = None + + if row is None: + raise ValueError('No book with id: %d'%idx) + fm = self.FIELD_MAP mi = Metadata(None, template_cache=self.formatter_template_cache) @@ -948,14 +956,13 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): mi.book_size = row[fm['size']] mi.ondevice_col= row[fm['ondevice']] mi.last_modified = row[fm['last_modified']] - id = idx if index_is_id else self.id(idx) formats = row[fm['formats']] mi.format_metadata = {} if not formats: good_formats = None else: formats = sorted(formats.split(',')) - mi.format_metadata = FormatMetadata(self, id, formats) + mi.format_metadata = FormatMetadata(self, idx, formats) good_formats = FormatsList(formats, mi.format_metadata) mi.formats = good_formats tags = row[fm['tags']] @@ -968,19 +975,18 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): if mi.series: mi.series_index = row[fm['series_index']] 
mi.rating = row[fm['rating']] - mi.set_identifiers(self.get_identifiers(id, index_is_id=True)) - mi.application_id = id - mi.id = id + mi.set_identifiers(self.get_identifiers(idx, index_is_id=True)) + mi.application_id = idx + mi.id = idx + mi.set_all_user_metadata(self.field_metadata.custom_field_metadata()) for key, meta in self.field_metadata.custom_iteritems(): - mi.set_user_metadata(key, meta) if meta['datatype'] == 'composite': mi.set(key, val=row[meta['rec_index']]) else: - mi.set(key, val=self.get_custom(idx, label=meta['label'], - index_is_id=index_is_id), - extra=self.get_custom_extra(idx, label=meta['label'], - index_is_id=index_is_id)) + val, extra = self.get_custom_and_extra(idx, label=meta['label'], + index_is_id=True) + mi.set(key, val=val, extra=extra) user_cats = self.prefs['user_categories'] user_cat_vals = {} @@ -999,12 +1005,12 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): if get_cover: if cover_as_data: - cdata = self.cover(id, index_is_id=True) + cdata = self.cover(idx, index_is_id=True) if cdata: mi.cover_data = ('jpeg', cdata) else: - mi.cover = self.cover(id, index_is_id=True, as_path=True) - mi.has_cover = _('Yes') if self.has_cover(id) else '' + mi.cover = self.cover(idx, index_is_id=True, as_path=True) + mi.has_cover = _('Yes') if self.has_cover(idx) else '' return mi def has_book(self, mi): diff --git a/src/calibre/library/field_metadata.py b/src/calibre/library/field_metadata.py index 5b81f4a6a6..58914e7572 100644 --- a/src/calibre/library/field_metadata.py +++ b/src/calibre/library/field_metadata.py @@ -388,6 +388,7 @@ class FieldMetadata(dict): def __init__(self): self._field_metadata = copy.deepcopy(self._field_metadata_prototype) self._tb_cats = OrderedDict() + self._tb_custom_fields = {} self._search_term_map = {} self.custom_label_to_key_map = {} for k,v in self._field_metadata: @@ -477,10 +478,8 @@ class FieldMetadata(dict): yield (key, self._tb_cats[key]) def custom_iteritems(self): - for key in 
self._tb_cats: - fm = self._tb_cats[key] - if fm['is_custom']: - yield (key, self._tb_cats[key]) + for key, meta in self._tb_custom_fields.iteritems(): + yield (key, meta) def items(self): return list(self.iteritems()) @@ -516,6 +515,8 @@ class FieldMetadata(dict): return l def custom_field_metadata(self, include_composites=True): + if include_composites: + return self._tb_custom_fields l = {} for k in self.custom_field_keys(include_composites): l[k] = self._tb_cats[k] @@ -537,6 +538,7 @@ class FieldMetadata(dict): 'is_custom':True, 'is_category':is_category, 'link_column':'value','category_sort':'value', 'is_csp' : is_csp, 'is_editable': is_editable,} + self._tb_custom_fields[key] = self._tb_cats[key] self._add_search_terms_to_map(key, [key]) self.custom_label_to_key_map[label] = key if datatype == 'series': diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index f6e293013e..a6d1467cab 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -29,6 +29,7 @@ It can convert every input format in the following list, to every output format. PRC is a generic format, |app| supports PRC files with TextRead and MOBIBook headers. PDB is also a generic format. |app| supports eReder, Plucker, PML and zTxt PDB files. DJVU support is only for converting DJVU files that contain embedded text. These are typically generated by OCR software. + MOBI books can be of two types Mobi6 and KF8. |app| currently fully supports Mobi6 and supports conversion from, but not to, KF8 .. _best-source-formats: diff --git a/src/calibre/manual/template_lang.rst b/src/calibre/manual/template_lang.rst index 555a29b269..0f6b912418 100644 --- a/src/calibre/manual/template_lang.rst +++ b/src/calibre/manual/template_lang.rst @@ -57,7 +57,7 @@ For example, assume you want to use the template:: {series} - {series_index} - {title} -If the book has no series, the answer will be ``- - title``. Many people would rather the result be simply ``title``, without the hyphens. 
To do this, use the extended syntax ``{field:|prefix_text|suffix_text}``. When you use this syntax, if field has the value SERIES then the result will be ``prefix_textSERIESsuffix_text``. If field has no value, then the result will be the empty string (nothing); the prefix and suffix are ignored. The prefix and suffix can contain blanks. +If the book has no series, the answer will be ``- - title``. Many people would rather the result be simply ``title``, without the hyphens. To do this, use the extended syntax ``{field:|prefix_text|suffix_text}``. When you use this syntax, if field has the value SERIES then the result will be ``prefix_textSERIESsuffix_text``. If field has no value, then the result will be the empty string (nothing); the prefix and suffix are ignored. The prefix and suffix can contain blanks. **Do not use subtemplates (`{ ... }`) or functions (see below) as the prefix or the suffix.** Using this syntax, we can solve the above series problem with the template:: @@ -65,7 +65,7 @@ Using this syntax, we can solve the above series problem with the template:: The hyphens will be included only if the book has a series index, which it will have only if it has a series. -Notes: you must include the : character if you want to use a prefix or a suffix. You must either use no \| characters or both of them; using one, as in ``{field:| - }``, is not allowed. It is OK not to provide any text for one side or the other, such as in ``{series:|| - }``. Using ``{title:||}`` is the same as using ``{title}``. +Notes: you must include the : character if you want to use a prefix or a suffix. You must either use no \| characters or both of them; using one, as in ``{field:| - }``, is not allowed. It is OK not to provide any text for one side or the other, such as in ``{series:|| - }``. Using ``{title:||}`` is the same as using ``{title}``. Second: formatting. Suppose you wanted to ensure that the series_index is always formatted as three digits with leading zeros. 
This would do the trick:: @@ -112,7 +112,7 @@ Functions are always applied before format specifications. See further down for The syntax for using functions is ``{field:function(arguments)}``, or ``{field:function(arguments)|prefix|suffix}``. Arguments are separated by commas. Commas inside arguments must be preceeded by a backslash ( '\\' ). The last (or only) argument cannot contain a closing parenthesis ( ')' ). Functions return the value of the field used in the template, suitably modified. -If you have programming experience, please note that the syntax in this mode (single function) is not what you might expect. Strings are not quoted. Spaces are significant. All arguments must be constants; there is no sub-evaluation. **Do not use subtemplates (`{ ... }`) as function arguments.** Instead, use :ref:`template program mode <template_mode>` and :ref:`general program mode <general_mode>`. +Important: If you have programming experience, please note that the syntax in this mode (single function) is not what you might expect. Strings are not quoted. Spaces are significant. All arguments must be constants; there is no sub-evaluation. **Do not use subtemplates (`{ ... }`) as function arguments.** Instead, use :ref:`template program mode <template_mode>` and :ref:`general program mode <general_mode>`. Many functions use regular expressions. In all cases, regular expression matching is case-insensitive. @@ -245,6 +245,7 @@ The following functions are available in addition to those described in single-f * ``current_library_name() -- `` return the last name on the path to the current calibre library. This function can be called in template program mode using the template ``{:'current_library_name()'}``. * ``days_between(date1, date2)`` -- return the number of days between ``date1`` and ``date2``. The number is positive if ``date1`` is greater than ``date2``, otherwise negative. If either ``date1`` or ``date2`` are not dates, the function returns the empty string. 
* ``divide(x, y)`` -- returns x / y. Throws an exception if either x or y are not numbers. + * ``eval(string)`` -- evaluates the string as a program, passing the local variables (those ``assign`` ed to). This permits using the template processor to construct complex results from local variables. * ``field(name)`` -- returns the metadata field named by ``name``. * ``first_non_empty(value, value, ...)`` -- returns the first value that is not empty. If all values are empty, then the empty value is returned. You can have as many values as you want. * ``format_date(x, date_format)`` -- format_date(val, format_string) -- format the value, which must be a date field, using the format_string, returning a string. The formatting codes are:: @@ -269,7 +270,19 @@ The following functions are available in addition to those described in single-f AP : use a 12-hour clock instead of a 24-hour clock, with 'AP' replaced by the localized string for AM or PM. iso : the date with time and timezone. Must be the only format present. - * ``eval(string)`` -- evaluates the string as a program, passing the local variables (those ``assign`` ed to). This permits using the template processor to construct complex results from local variables. + * ``finish_formatting(val, fmt, prefix, suffix)`` -- apply the format, prefix, and suffix to a value in the same way as done in a template like ``{series_index:05.2f| - |- }``. This function is provided to ease conversion of complex single-function- or template-program-mode templates to :ref:`general program mode <general_mode>` (see below) to take advantage of GPM template compilation. 
For example, the following program produces the same output as the above template:: + + program: finish_formatting(field("series_index"), "05.2f", " - ", " - ") + + Another example: for the template ``{series:re(([^\s])[^\s]+(\s|$),\1)}{series_index:0>2s| - | - }{title}`` use:: + + program: + strcat( + re(field('series'), '([^\s])[^\s]+(\s|$)', '\1'), + finish_formatting(field('series_index'), '0>2s', ' - ', ' - '), + field('title') + ) + * ``formats_modtimes(date_format)`` -- return a comma-separated list of colon_separated items representing modification times for the formats of a book. The date_format parameter specifies how the date is to be formatted. See the date_format function for details. You can use the select function to get the mod time for a specific format. Note that format names are always uppercase, as in EPUB. * ``formats_sizes()`` -- return a comma-separated list of colon_separated items representing sizes in bytes of the formats of a book. You can use the select function to get the size for a specific format. Note that format names are always uppercase, as in EPUB. * ``has_cover()`` -- return ``Yes`` if the book has a cover, otherwise return the empty string @@ -312,7 +325,7 @@ Using general program mode For more complicated template programs, it is sometimes easier to avoid template syntax (all the `{` and `}` characters), instead writing a more classical-looking program. You can do this in |app| by beginning the template with `program:`. In this case, no template processing is done. The special variable `$` is not set. It is up to your program to produce the correct results. -One advantage of `program:` mode is that the brackets are no longer special. For example, it is not necessary to use `[[` and `]]` when using the `template()` function. +One advantage of `program:` mode is that the brackets are no longer special. For example, it is not necessary to use `[[` and `]]` when using the `template()` function. 
Another advantage is that program mode templates are compiled to Python and can run much faster than templates in the other two modes. Speed improvement depends on the complexity of the templates; the more complicated the template the more the improvement. Compilation is turned off or on using the tweak ``compile_gpm_templates`` (Compile General Program Mode templates to Python). The main reason to turn off compilation is if a compiled template does not work, in which case please file a bug report. The following example is a `program:` mode implementation of a recipe on the MobileRead forum: "Put series into the title, using either initials or a shortened form. Strip leading articles from the series name (any)." For example, for the book The Two Towers in the Lord of the Rings series, the recipe gives `LotR [02] The Two Towers`. Using standard templates, the recipe requires three custom columns and a plugboard, as explained in the following: diff --git a/src/calibre/utils/formatter.py b/src/calibre/utils/formatter.py index b1224de3da..2be42bc0ee 100644 --- a/src/calibre/utils/formatter.py +++ b/src/calibre/utils/formatter.py @@ -11,7 +11,8 @@ __docformat__ = 'restructuredtext en' import re, string, traceback from calibre.constants import DEBUG -from calibre.utils.formatter_functions import formatter_functions +from calibre.utils.formatter_functions import formatter_functions, compile_user_function +from calibre.utils.config import tweaks class _Parser(object): LEX_OP = 1 @@ -172,6 +173,138 @@ class _Parser(object): self.error(_('expression is not function or constant')) +class _CompileParser(_Parser): + def __init__(self, val, prog, parent, compile_text): + self.lex_pos = 0 + self.prog = prog[0] + self.prog_len = len(self.prog) + if prog[1] != '': + self.error(_('failed to scan program. 
Invalid input {0}').format(prog[1])) + self.parent = parent + parent.locals = {'$':val} + self.parent_kwargs = parent.kwargs + self.parent_book = parent.book + self.parent_locals = parent.locals + self.compile_text = compile_text + + def program(self): + if self.compile_text: + t = self.compile_text + self.compile_text = '\n' + self.max_level = 0 + val = self.statement() + if not self.token_is_eof(): + self.error(_('syntax error - program ends before EOF')) + if self.compile_text: + t += "\targs=[[]" + for i in range(0, self.max_level): + t += ", None" + t += ']' + self.compile_text = t + self.compile_text + "\treturn args[0][0]\n" + return val + + def statement(self, level=0): + while True: + val = self.expr(level) + if self.token_is_eof(): + return val + if not self.token_op_is_a_semicolon(): + return val + if self.compile_text: + self.compile_text += "\targs[%d] = list()\n"%(level,) + self.consume() + if self.token_is_eof(): + return val + + def expr(self, level): + if self.compile_text: + self.max_level = max(level, self.max_level) + + if self.token_is_id(): + funcs = formatter_functions().get_functions() + # We have an identifier. Determine if it is a function + id = self.token() + if not self.token_op_is_a_lparen(): + if self.token_op_is_a_equals(): + # classic assignment statement + self.consume() + cls = funcs['assign'] + if self.compile_text: + self.compile_text += '\targs[%d] = list()\n'%(level+1,) + val = cls.eval_(self.parent, self.parent_kwargs, + self.parent_book, self.parent_locals, id, self.expr(level+1)) + if self.compile_text: + self.compile_text += "\tlocals['%s'] = args[%d][0]\n"%(id, level+1) + self.compile_text += "\targs[%d].append(args[%d][0])\n"%(level, level+1) + return val + val = self.parent.locals.get(id, None) + if val is None: + self.error(_('Unknown identifier ') + id) + if self.compile_text: + self.compile_text += "\targs[%d].append(locals.get('%s'))\n"%(level, id) + return val + # We have a function. + # Check if it is a known one. 
We do this here so error reporting is + # better, as it can identify the tokens near the problem. + if id not in funcs: + self.error(_('unknown function {0}').format(id)) + + # Eat the paren + self.consume() + args = list() + if self.compile_text: + self.compile_text += '\targs[%d] = list()\n'%(level+1, ) + if id == 'field': + val = self.expr(level+1) + val = self.parent.get_value(val, [], self.parent_kwargs) + if self.compile_text: + self.compile_text += "\targs[%d].append(formatter.get_value(args[%d][0], [], kwargs))\n"%(level, level+1) + if self.token() != ')': + self.error(_('missing closing parenthesis')) + return val + while not self.token_op_is_a_rparen(): + if id == 'assign' and len(args) == 0: + # Must handle the lvalue semantics of the assign function. + # The first argument is the name of the destination, not + # the value. + if not self.token_is_id(): + self.error('assign requires the first parameter be an id') + t = self.token() + args.append(t) + if self.compile_text: + self.compile_text += "\targs[%d].append('%s')\n"%(level+1, t) + else: + # evaluate the argument (recursive call) + args.append(self.statement(level=level+1)) + if not self.token_op_is_a_comma(): + break + self.consume() + if self.token() != ')': + self.error(_('missing closing parenthesis')) + + # Evaluate the function + cls = funcs[id] + if cls.arg_count != -1 and len(args) != cls.arg_count: + self.error('incorrect number of arguments for function {}'.format(id)) + if self.compile_text: + self.compile_text += ( + "\targs[%d].append(self.__funcs__['%s']" + ".evaluate(formatter, kwargs, book, locals, *args[%d]))\n")%(level, id, level+1) + return cls.eval_(self.parent, self.parent_kwargs, + self.parent_book, self.parent_locals, *args) + elif self.token_is_constant(): + # String or number + v = self.token() + if self.compile_text: + tv = v.replace("\\", "\\\\") + tv = tv.replace("'", "\\'") + self.compile_text += "\targs[%d].append('%s')\n"%(level, tv) + return v + else: + 
self.error(_('expression is not function or constant')) + +compile_counter = 0 + class TemplateFormatter(string.Formatter): ''' Provides a format function that substitutes '' for any missing value @@ -249,15 +382,36 @@ class TemplateFormatter(string.Formatter): # keep a cache of the lex'ed program under the theory that re-lexing # is much more expensive than the cache lookup. This is certainly true # for more than a few tokens, but it isn't clear for simple programs. - if column_name is not None and self.template_cache is not None: - lprog = self.template_cache.get(column_name, None) - if not lprog: + if tweaks['compile_gpm_templates']: + if column_name is not None and self.template_cache is not None: + lprog = self.template_cache.get(column_name, None) + if lprog: + return lprog.evaluate(self, self.kwargs, self.book, self.locals) lprog = self.lex_scanner.scan(prog) - self.template_cache[column_name] = lprog + compile_text = ('__funcs__ = formatter_functions().get_functions()\n' + 'def evaluate(self, formatter, kwargs, book, locals):\n' + ) + else: + lprog = self.lex_scanner.scan(prog) + compile_text = None + parser = _CompileParser(val, lprog, self, compile_text) + val = parser.program() + if parser.compile_text: + global compile_counter + compile_counter += 1 + f = compile_user_function("__A" + str(compile_counter), 'doc', -1, parser.compile_text) + self.template_cache[column_name] = f else: + if column_name is not None and self.template_cache is not None: + lprog = self.template_cache.get(column_name, None) + if not lprog: + lprog = self.lex_scanner.scan(prog) + self.template_cache[column_name] = lprog + else: lprog = self.lex_scanner.scan(prog) - parser = _Parser(val, lprog, self) - return parser.program() + parser = _Parser(val, lprog, self) + val = parser.program() + return val ################## Override parent classes methods ##################### diff --git a/src/calibre/utils/formatter_functions.py b/src/calibre/utils/formatter_functions.py index 
ec887887db..c4eb80d3e0 100644 --- a/src/calibre/utils/formatter_functions.py +++ b/src/calibre/utils/formatter_functions.py @@ -11,6 +11,7 @@ __docformat__ = 'restructuredtext en' import inspect, re, traceback from calibre import human_readable +from calibre.constants import DEBUG from calibre.utils.titlecase import titlecase from calibre.utils.icu import capitalize, strcmp, sort_key from calibre.utils.date import parse_date, format_date, now, UNDEFINED_DATE @@ -1118,12 +1119,28 @@ class BuiltinCurrentLibraryName(BuiltinFormatterFunction): from calibre.library import current_library_name return current_library_name() +class BuiltinFinishFormatting(BuiltinFormatterFunction): + name = 'finish_formatting' + arg_count = 4 + category = 'Formatting values' + __doc__ = doc = _('finish_formatting(val, fmt, prefix, suffix) -- apply the ' + 'format, prefix, and suffix to a value in the same way as ' + 'done in a template like {series_index:05.2f| - |- }. For ' + 'example, the following program produces the same output ' + 'as the above template: ' + 'program: finish_formatting(field("series_index"), "05.2f", " - ", " - ")') + + def evaluate(self, formatter, kwargs, mi, locals_, val, fmt, prefix, suffix): + if not val: + return val + return prefix + formatter._do_format(val, fmt) + suffix + _formatter_builtins = [ BuiltinAdd(), BuiltinAnd(), BuiltinAssign(), BuiltinBooksize(), BuiltinCapitalize(), BuiltinCmp(), BuiltinContains(), BuiltinCount(), BuiltinCurrentLibraryName(), - BuiltinDaysBetween(), BuiltinDivide(), BuiltinEval(), - BuiltinFirstNonEmpty(), BuiltinField(), BuiltinFormatDate(), + BuiltinDaysBetween(), BuiltinDivide(), BuiltinEval(), BuiltinFirstNonEmpty(), + BuiltinField(), BuiltinFinishFormatting(), BuiltinFormatDate(), BuiltinFormatNumber(), BuiltinFormatsModtimes(), BuiltinFormatsSizes(), BuiltinHasCover(), BuiltinHumanReadable(), BuiltinIdentifierInList(), BuiltinIfempty(), BuiltinLanguageCodes(), BuiltinLanguageStrings(), @@ -1156,11 +1173,14 @@ def 
compile_user_function(name, doc, arg_count, eval_func): for line in eval_func.splitlines()]) prog = ''' from calibre.utils.formatter_functions import FormatterUserFunction +from calibre.utils.formatter_functions import formatter_functions class UserFunction(FormatterUserFunction): ''' + func - locals = {} - exec prog in locals - cls = locals['UserFunction'](name, doc, arg_count, eval_func) + locals_ = {} + if DEBUG: + print prog + exec prog in locals_ + cls = locals_['UserFunction'](name, doc, arg_count, eval_func) return cls def load_user_template_functions(funcs): diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index 9993d9a3db..03ce64a750 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -14,7 +14,7 @@ from PIL import Image from cStringIO import StringIO from calibre import browser, relpath, unicode_path -from calibre.constants import filesystem_encoding +from calibre.constants import filesystem_encoding, iswindows from calibre.utils.filenames import ascii_filename from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.ebooks.chardet import xml_to_unicode @@ -213,6 +213,8 @@ class RecursiveFetcher(object): is_local = 5 if is_local > 0: url = url[is_local:] + if iswindows and url.startswith('/'): + url = url[1:] with open(url, 'rb') as f: data = response(f.read()) data.newurl = 'file:'+url # This is what mechanize does for