diff --git a/setup.py b/setup.py index 407b852a57..ee2d54cc5a 100644 --- a/setup.py +++ b/setup.py @@ -72,6 +72,9 @@ if __name__ == '__main__': library_dirs=[os.environ.get('PODOFO_LIB_DIR', podofo_lib)], include_dirs=\ [os.environ.get('PODOFO_INC_DIR', podofo_inc)])) + else: + print 'WARNING: PoDoFo not found on your system. Various PDF related', + print 'functionality will not work.' ext_modules = optional + [ diff --git a/src/calibre/ebooks/mobi/output.py b/src/calibre/ebooks/mobi/output.py index 026dbdc165..1c1e4795b6 100644 --- a/src/calibre/ebooks/mobi/output.py +++ b/src/calibre/ebooks/mobi/output.py @@ -27,6 +27,15 @@ class MOBIOutput(OutputFormatPlugin): OptionRecommendation(name='toc_title', recommended_value=None, help=_('Title for any generated in-line table of contents.') ), + OptionRecommendation(name='mobi_periodical', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('When present, generate a periodical rather than a book.') + ), + OptionRecommendation(name='no_mobi_index', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Disable generation of MOBI index.') + ), + ]) recommendations = set([ @@ -49,7 +58,7 @@ class MOBIOutput(OutputFormatPlugin): rasterizer(oeb, opts) mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables) mobimlizer(oeb, opts) - writer = MobiWriter(imagemax=imagemax, + writer = MobiWriter(opts, imagemax=imagemax, prefer_author_sort=opts.prefer_author_sort) writer(oeb, output_path) diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index e93eb1ae27..19e997f258 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -3,7 +3,8 @@ Write content to Mobipocket books. ''' __license__ = 'GPL v3' -__copyright__ = '2008, Marshall T. Vandegrift ' +__copyright__ = '2008, Marshall T. 
Vandegrift and \ + Kovid Goyal ' from collections import defaultdict from itertools import count @@ -57,6 +58,25 @@ OTHER_MAX_IMAGE_SIZE = 10 * 1024 * 1024 MAX_THUMB_SIZE = 16 * 1024 MAX_THUMB_DIMEN = (180, 240) + +TAGX = { + 'chapter' : + '\x00\x00\x00\x01\x01\x01\x01\x00\x02\x01\x02\x00\x03\x01\x04\x00\x04\x01\x08\x00\x00\x00\x00\x01', + 'subchapter' : + '\x00\x00\x00\x01\x01\x01\x01\x00\x02\x01\x02\x00\x03\x01\x04\x00\x04\x01\x08\x00\x05\x01\x10\x00\x15\x01\x10\x00\x16\x01\x20\x00\x17\x01\x40\x00\x00\x00\x00\x01', + 'periodical' : + '\x00\x00\x00\x02\x01\x01\x01\x00\x02\x01\x02\x00\x03\x01\x04\x00\x04\x01\x08\x00\x05\x01\x10\x00\x15\x01\x20\x00\x16\x01\x40\x00\x17\x01\x80\x00\x00\x00\x00\x01\x45\x01\x01\x00\x46\x01\x02\x00\x47\x01\x04\x00\x00\x00\x00\x01' + } + +INDXT = { + 'chapter' : '\x0f', + 'subchapter' : '\x1f', + 'article' : '\x3f', + 'chapter with subchapters': '\x6f', + 'periodical' : '\xdf', + 'section' : '\xff', + } + def encode(data): return data.encode('utf-8') @@ -202,13 +222,11 @@ class Serializer(object): def serialize_item(self, item): buffer = self.buffer - #buffer.write('') if not item.linear: self.breaks.append(buffer.tell() - 1) self.id_offsets[item.href] = buffer.tell() for elem in item.data.find(XHTML('body')): self.serialize_elem(elem, item) - #buffer.write('') buffer.write('') def serialize_elem(self, elem, item, nsrmap=NSRMAP): @@ -288,11 +306,13 @@ class Serializer(object): class MobiWriter(object): COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') - def __init__(self, compression=PALMDOC, imagemax=None, + def __init__(self, opts, compression=PALMDOC, imagemax=None, prefer_author_sort=False): + self.opts = opts self._compression = compression or UNCOMPRESSED self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE self._prefer_author_sort = prefer_author_sort + self._primary_index_record = None @classmethod def generate(cls, opts): @@ -327,6 +347,8 @@ class MobiWriter(object): def _generate_content(self): self._map_image_names() self._generate_text() 
+ if not self.opts.no_mobi_index: + self._generate_index() self._generate_images() def _map_image_names(self): @@ -372,6 +394,8 @@ class MobiWriter(object): serializer = Serializer(self._oeb, self._images) breaks = serializer.breaks text = serializer.text + self._id_offsets = serializer.id_offsets + self._content_length = len(text) self._text_length = len(text) text = StringIO(text) nrecords = 0 @@ -408,10 +432,205 @@ class MobiWriter(object): data, overlap = self._read_text_record(text) self._text_nrecords = nrecords + def _generate_indxt(self, ctoc): + if self.opts.mobi_periodical: + raise NotImplementedError('Indexing for periodicals not implemented') + toc = self._oeb.toc + indxt, indices, c = StringIO(), StringIO(), 0 + + indices.write('IDXT') + c = 0 + last_index = last_name = None + + def add_node(node, offset, length, count): + t = node.title + if self.opts.verbose > 2: + self._oeb.log.debug('Adding TOC node:', node.title, 'href:', + node.href) + + pos = 0xc0 + indxt.tell() + indices.write(pack('>H', pos)) + indxt.write(chr(len(str(count)))+str(count)) + indxt.write(INDXT['chapter']) + indxt.write(decint(offset, DECINT_FORWARD)) + indxt.write(decint(length, DECINT_FORWARD)) + indxt.write(decint(self._ctoc_map[node], DECINT_FORWARD)) + indxt.write(decint(0, DECINT_FORWARD)) + + + entries = list(toc.iter())[1:] + for i, child in enumerate(entries): + if not child.title or not child.title.strip(): + continue + h = child.href + if h not in self._id_offsets: + self._oeb.log.warning('Could not find TOC entry:', child.title) + continue + offset = self._id_offsets[h] + length = None + for sibling in entries[i+1:]: + h2 = sibling.href + if h2 in self._id_offsets: + offset2 = self._id_offsets[h2] + if offset2 > offset: + length = offset2 - offset + break + if length is None: + length = self._content_length - offset + + add_node(child, offset, length, c) + last_index = c + ctoc_offset = self._ctoc_map[child] + last_name = self._ctoc_name_map[child] + c += 1 + + return 
indxt.getvalue(), c, indices.getvalue(), last_index, last_name + + + def _generate_index(self): + self._oeb.log('Generating index...') + self._primary_index_record = None + ctoc = self._generate_ctoc() + indxt, indxt_count, indices, last_index, last_name = \ + self._generate_indxt(ctoc) + + indx1 = StringIO() + indx1.write('INDX'+pack('>I', 0xc0)) # header length + + # 0x8 - 0xb : Unknown + indx1.write('\0'*4) + + # 0xc - 0xf : Header type + indx1.write(pack('>I', 1)) + + # 0x10 - 0x13 : Unknown + indx1.write('\0'*4) + + # 0x14 - 0x17 : IDXT offset + # 0x18 - 0x1b : IDXT count + indx1.write(pack('>I', 0xc0+len(indxt))) + indx1.write(pack('>I', indxt_count)) + + # 0x1c - 0x23 : Unknown + indx1.write('\xff'*8) + + # 0x24 - 0xbf + indx1.write('\0'*156) + indx1.write(indxt) + indx1.write(indices) + indx1 = indx1.getvalue() + + idxt0 = last_name + pack('>H', last_index) + indx0 = StringIO() + + tagx = TAGX['periodical' if self.opts.mobi_periodical else 'chapter'] + tagx = 'TAGX' + pack('>I', 8 + len(tagx)) + tagx + indx0_indices_pos = 0xc0 + len(tagx) + len(idxt0) + indx0_indices = 'IDXT' + pack('>H', 0xc0 + len(tagx)) + # Generate record header + header = StringIO() + + header.write('INDX') + header.write(pack('>I', 0xc0)) # header length + + # 0x08 - 0x0b : Unknown + header.write('\0'*4) + + # 0x0c - 0x0f : Header type + header.write(pack('>I', 0)) + + # 0x10 - 0x13 : Generator ID + header.write(pack('>I', 6)) + + # 0x14 - 0x17 : IDXT offset + header.write(pack('>I', indx0_indices_pos)) + + # 0x18 - 0x1b : IDXT count + header.write(pack('>I', 1)) + + # 0x1c - 0x1f : Text encoding (65001 is utf-8) + header.write(pack('>I', 65001)) + + # 0x20 - 0x23 : Language code? 
+ header.write(iana2mobi(str(self._oeb.metadata.language[0]))) + + # 0x24 - 0x27 : Number of TOC entries in INDX1 + header.write(pack('>I', indxt_count)) + + # 0x28 - 0x2b : ORDT Offset + header.write('\0'*4) + + # 0x2c - 0x2f : LIGT offset + header.write('\0'*4) + + # 0x30 - 0x33 : Number of LIGT entries + header.write('\0'*4) + + # 0x34 - 0x37 : Unknown + header.write(pack('>I', 1)) + + # 0x38 - 0xb3 : Unknown (pad?) + header.write('\0'*124) + + # 0xb4 - 0xb7 : TAGX offset + header.write(pack('>I', 0xc0)) + + # 0xb8 - 0xbf : Unknown + header.write('\0'*8) + + header = header.getvalue() + + indx0.write(header) + indx0.write(tagx) + indx0.write(idxt0) + indx0.write(indx0_indices) + indx0 = indx0.getvalue() + + self._primary_index_record = len(self._records) + if self.opts.verbose > 3: + from tempfile import mkdtemp + import os + t = mkdtemp() + open(os.path.join(t, 'indx0.bin'), 'wb').write(indx0) + open(os.path.join(t, 'indx1.bin'), 'wb').write(indx1) + open(os.path.join(t, 'ctoc.bin'), 'wb').write(ctoc) + self._oeb.log.debug('Index records dumped to', t) + + self._records.extend([indx0, indx1, ctoc]) + + def _generate_ctoc(self): + if self.opts.mobi_periodical: + raise NotImplementedError('Indexing for periodicals not implemented') + toc = self._oeb.toc + self._ctoc_map = {} + self._ctoc_name_map = {} + self._last_toc_entry = None + ctoc = StringIO() + + def add_node(node, cls): + t = node.title + if t and t.strip(): + t = t.strip() + if not isinstance(t, unicode): + t = t.decode('utf-8', 'replace') + t = t.encode('utf-8') + self._last_toc_entry = t + self._ctoc_map[node] = ctoc.tell() + self._ctoc_name_map[node] = decint(len(t), DECINT_FORWARD)+t + ctoc.write(self._ctoc_name_map[node]) + + for child in toc.iter(): + add_node(child, 'chapter') + + return ctoc.getvalue() + + + def _generate_images(self): self._oeb.logger.info('Serializing images...') images = [(index, href) for href, index in self._images.items()] images.sort() + self._first_image_record = None 
for _, href in images: item = self._oeb.manifest.hrefs[href] try: @@ -420,6 +639,8 @@ class MobiWriter(object): self._oeb.logger.warn('Bad image file %r' % item.href) continue self._records.append(data) + if self._first_image_record is None: + self._first_image_record = len(self._records)-1 def _generate_record0(self): metadata = self._oeb.metadata @@ -446,8 +667,9 @@ class MobiWriter(object): # 0xC - 0xF : Text encoding (65001 is utf-8) # 0x10 - 0x13 : UID # 0x14 - 0x17 : Generator version + btype = 0x101 if self.opts.mobi_periodical else 2 record0.write(pack('>IIIII', - 0xe8, 2, 65001, uid, 6)) + 0xe8, btype, 65001, uid, 6)) # 0x18 - 0x1f : Unknown record0.write('\xff' * 8) @@ -477,7 +699,7 @@ class MobiWriter(object): # 0x58 - 0x5b : Format version # 0x5c - 0x5f : First image record number record0.write(pack('>II', - 6, self._text_nrecords + 1)) + 6, self._first_image_record if self._first_image_record else 0)) # 0x60 - 0x63 : First HUFF/CDIC record number # 0x64 - 0x67 : Number of HUFF/CDIC records @@ -537,8 +759,8 @@ class MobiWriter(object): record0.write(pack('>I', 5)) # 0xe4 - 0xe7 : Primary index record - # TODO: Implement - record0.write(pack('>I', 0xffffffff)) + record0.write(pack('>I', 0xffffffff if self._primary_index_record is + None else self._primary_index_record)) record0.write(exth) record0.write(title) diff --git a/src/calibre/ebooks/oeb/transforms/htmltoc.py b/src/calibre/ebooks/oeb/transforms/htmltoc.py index 7f3391e72b..1aef7e56cc 100644 --- a/src/calibre/ebooks/oeb/transforms/htmltoc.py +++ b/src/calibre/ebooks/oeb/transforms/htmltoc.py @@ -30,7 +30,7 @@ STYLE_CSS = { margin-left: 3.6em; } """, - + 'centered': """ .calibre_toc_header { text-align: center; @@ -48,18 +48,18 @@ class HTMLTOCAdder(object): def __init__(self, title=None, style='nested'): self.title = title self.style = style - + @classmethod def config(cls, cfg): group = cfg.add_group('htmltoc', _('HTML TOC generation options.')) - group('toc_title', ['--toc-title'], default=None, 
+ group('toc_title', ['--toc-title'], default=None, help=_('Title for any generated in-line table of contents.')) return cfg @classmethod def generate(cls, opts): return cls(title=opts.toc_title) - + def __call__(self, oeb, context): if 'toc' in oeb.guide: return diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 9582651ca0..ffa6dd6b16 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -20,7 +20,6 @@ entry_points = { 'ebook-convert = calibre.ebooks.conversion.cli:main', 'markdown-calibre = calibre.ebooks.markdown.markdown:main', 'web2disk = calibre.web.fetch.simple:main', - 'feeds2disk = calibre.web.feeds.main:main', 'calibre-server = calibre.library.server:main', 'lrf2lrs = calibre.ebooks.lrf.lrfparser:main', 'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main', diff --git a/src/calibre/utils/complete.py b/src/calibre/utils/complete.py index 7164e61635..c713bbe82a 100644 --- a/src/calibre/utils/complete.py +++ b/src/calibre/utils/complete.py @@ -53,7 +53,7 @@ def get_opts_from_parser(parser, prefix): for x in do_opt(o): yield x def send(ans): - pat = re.compile('([^0-9a-zA-Z_./])') + pat = re.compile('([^0-9a-zA-Z_./-])') for x in sorted(set(ans)): x = pat.sub(lambda m : '\\'+m.group(1), x) if x.endswith('\\ '): diff --git a/src/calibre/web/feeds/main.py b/src/calibre/web/feeds/main.py deleted file mode 100644 index 61bfa97e11..0000000000 --- a/src/calibre/web/feeds/main.py +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env python -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -''' -CLI for downloading feeds. 
-''' - -import sys, os -from calibre.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles -from calibre.web.fetch.simple import option_parser as _option_parser -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.utils.config import Config, StringConfig - -def config(defaults=None): - desc = _('Options to control the fetching of periodical content from the web.') - c = Config('feeds2disk', desc) if defaults is None else StringConfig(defaults, desc) - - web2disk = c.add_group('web2disk', _('Customize the download engine')) - web2disk('timeout', ['-t', '--timeout'], default=10.0, - help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),) - web2disk('delay', ['--delay'], default=0, - help=_('Minimum interval in seconds between consecutive fetches. Default is %default s')) - web2disk('encoding', ['--encoding'], default=None, - help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.')) - web2disk('match_regexps', ['--match-regexp'], default=[], action='append', - help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')) - web2disk('filter_regexps', ['--filter-regexp'], default=[], action='append', - help=_('Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')) - web2disk('no_stylesheets', ['--dont-download-stylesheets'], action='store_true', default=False, - help=_('Do not download CSS stylesheets.')) - - c.add_opt('feeds', ['--feeds'], default=None, - help=_('''Specify a list of feeds to download. 
For example: -"['http://feeds.newsweek.com/newsweek/TopNews', 'http://feeds.newsweek.com/headlines/politics']" -If you specify this option, any argument to %prog is ignored and a default recipe is used to download the feeds.''')) - c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', - help=_('''Be more verbose while processing.''')) - c.add_opt('title', ['--title'], default=None, - help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.')) - c.add_opt('username', ['-u', '--username'], default=None, - help=_('Username for sites that require a login to access content.')) - c.add_opt('password', ['-p', '--password'], default=None, - help=_('Password for sites that require a login to access content.')) - c.add_opt('lrf', ['--lrf'], default=False, action='store_true', - help='Optimize fetching for subsequent conversion to LRF.') - c.add_opt('epub', ['--epub'], default=False, action='store_true', - help='Optimize fetching for subsequent conversion to EPUB.') - c.add_opt('mobi', ['--mobi'], default=False, action='store_true', - help='Optimize fetching for subsequent conversion to MOBI.') - c.add_opt('recursions', ['--recursions'], default=0, - help=_('Number of levels of links to follow on webpages that are linked to from feeds. Defaul %default')) - c.add_opt('output_dir', ['--output-dir'], default='.', - help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.')) - c.add_opt('no_progress_bar', ['--no-progress-bar'], default=False, action='store_true', - help=_("Don't show the progress bar")) - c.add_opt('debug', ['--debug'], action='store_true', default=False, - help=_('Very verbose output, useful for debugging.')) - c.add_opt('test', ['--test'], action='store_true', default=False, - help=_('Useful for recipe development. 
Forces max_articles_per_feed to 2 and downloads at most 2 feeds.')) - - return c - -USAGE=_('''\ -%%prog [options] ARG - -%%prog parses an online source of articles, like an RSS or ATOM feed and -fetches the article contents organized in a nice hierarchy. - -ARG can be one of: - -file name - %%prog will try to load a recipe from the file - -builtin recipe title - %%prog will load the builtin recipe and use it to fetch the feed. For e.g. Newsweek or "The BBC" or "The New York Times" - -recipe as a string - %%prog will load the recipe directly from the string arg. - -Available builtin recipes are: -%s -''')%(unicode(list(titles))[1:-1]) - -def option_parser(usage=USAGE): - p = _option_parser(usage=usage) - p.remove_option('--max-recursions') - p.remove_option('--base-dir') - p.remove_option('--verbose') - p.remove_option('--max-files') - p.subsume('WEB2DISK OPTIONS', _('Options to control web2disk (used to fetch websites linked from feeds)')) - - p.add_option('--feeds', default=None, - help=_('''Specify a list of feeds to download. For example: -"['http://feeds.newsweek.com/newsweek/TopNews', 'http://feeds.newsweek.com/headlines/politics']" -If you specify this option, any argument to %prog is ignored and a default recipe is used to download the feeds.''')) - p.add_option('--verbose', default=False, action='store_true', - help=_('''Be more verbose while processing.''')) - p.add_option('--title', default=None, - help=_('The title for this recipe. 
Used as the title for any ebooks created from the downloaded feeds.')) - p.add_option('--username', default=None, help=_('Username for sites that require a login to access content.')) - p.add_option('--password', default=None, help=_('Password for sites that require a login to access content.')) - p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.') - p.add_option('--recursions', default=0, type='int', - help=_('Number of levels of links to follow on webpages that are linked to from feeds. Defaul %default')) - p.add_option('--output-dir', default=os.getcwd(), - help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.')) - p.add_option('--no-progress-bar', dest='no_progress_bar', default=False, action='store_true', - help=_('Dont show the progress bar')) - p.add_option('--debug', action='store_true', default=False, - help=_('Very verbose output, useful for debugging.')) - p.add_option('--test', action='store_true', default=False, - help=_('Useful for recipe development. 
Forces max_articles_per_feed to 2 and downloads at most 2 feeds.')) - - return p - -class RecipeError(Exception): - pass - -def run_recipe(opts, recipe_arg, parser, notification=None): - if notification is None: - from calibre.utils.terminfo import TerminalController, ProgressBar - term = TerminalController(sys.stdout) - pb = ProgressBar(term, _('Fetching feeds...'), no_progress_bar=opts.no_progress_bar) - notification = pb.update - - recipe = None - if opts.feeds is not None: - recipe = BasicNewsRecipe - else: - try: - if os.access(recipe_arg, os.R_OK): - recipe = compile_recipe(open(recipe_arg).read()) - else: - raise Exception('not file') - except: - recipe = get_builtin_recipe(recipe_arg) - if recipe is None: - recipe = compile_recipe(recipe_arg) - - if recipe is None: - raise RecipeError(recipe_arg+ ' is an invalid recipe') - - recipe = recipe(opts, parser, notification) - - if not os.path.exists(recipe.output_dir): - os.makedirs(recipe.output_dir) - recipe.download(for_lrf=True) - - return recipe - -def main(args=sys.argv, notification=None): - p = option_parser() - opts, args = p.parse_args(args=args[1:]) - - if len(args) != 1 and opts.feeds is None: - p.print_help() - return 1 - recipe_arg = args[0] if len(args) > 0 else None - run_recipe(opts, recipe_arg, p, notification=notification) - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/web/feeds/recipes/recipe_newsweek.py b/src/calibre/web/feeds/recipes/recipe_newsweek.py index 54e54a9a83..ffeb04f4a5 100644 --- a/src/calibre/web/feeds/recipes/recipe_newsweek.py +++ b/src/calibre/web/feeds/recipes/recipe_newsweek.py @@ -109,7 +109,7 @@ class Newsweek(BasicNewsRecipe): def get_cover_url(self): cover_url = None - soup = self.index_to_soup(self.INDEX) + soup = self.index_to_soup('http://www.newsweek.com') link_item = soup.find('div',attrs={'class':'cover-image'}) if link_item and link_item.a and link_item.a.img: cover_url = link_item.a.img['src']