diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index 8fa259694a..0d5ed517f3 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en' ''' Conversion to EPUB. ''' -import sys +import sys, textwrap from calibre.utils.config import Config, StringConfig from calibre.utils.zipfile import ZipFile, ZIP_DEFLATED from calibre.ebooks.html import config as common_config @@ -53,9 +53,21 @@ The expression used must evaluate to a list of elements. To disable chapter dete use the expression "/". See the XPath Tutorial in the calibre User Manual for further help on using this feature. ''').replace('\n', ' ')) - structure('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False, - help=_('Don\'t add detected chapters to the Table of Contents')) - structure('no_links_in_toc', ['--no-links-in-toc'], default=False, - help=_('Don\'t add links in the root HTML file to the Table of Contents')) + + toc = c.add_group('toc', + _('''\ +Control the automatic generation of a Table of Contents. If an OPF file is detected +and it specifies a Table of Contents, then that will be used rather than trying +to auto-generate a Table of Contents. +''').replace('\n', ' ')) + toc('max_toc_recursion', ['--max-toc-recursion'], default=1, + help=_('Number of levels of HTML files to try to autodetect TOC entries from. Set to 0 to disable all TOC autodetection. Default is %default.')) + toc('max_toc_links', ['--max-toc-links'], default=40, + help=_('Maximum number of links from each HTML file to insert into the TOC. Set to 0 to disable. 
Default is: %default.')) + toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False, + help=_("Don't add auto-detected chapters to the Table of Contents.")) + toc('add_files_to_toc', ['--add-files-to-toc'], default=False, + help=_('If more than one HTML file is found, create a TOC entry for each file.')) + return c \ No newline at end of file diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index 1c21bf4c2b..525b78772f 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -1,13 +1,16 @@ from __future__ import with_statement +from calibre.ebooks.metadata.opf import OPFReader __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' -import os, sys, re, shutil +import os, sys, re, shutil, cStringIO from lxml.etree import XPath -from calibre.ebooks.html import Parser, get_text, merge_metadata, get_filelist +from calibre.ebooks.html import Parser, get_text, merge_metadata, get_filelist,\ + opf_traverse, create_metadata, rebase_toc from calibre.ebooks.epub import config as common_config -from calibre.ptempfile import PersistentTemporaryDirectory +from calibre.ptempfile import TemporaryDirectory +from calibre.ebooks.metadata import MetaInformation class HTMLProcessor(Parser): @@ -17,7 +20,7 @@ class HTMLProcessor(Parser): name='html2epub') if opts.verbose > 2: self.debug_tree('parsed') - self.detected_chapters = self.opts.chapter(self.root) + self.detect_chapters() self.extract_css() if opts.verbose > 2: @@ -27,6 +30,13 @@ class HTMLProcessor(Parser): self.split() + def detect_chapters(self): + self.detected_chapters = self.opts.chapter(self.root) + for elem in self.detected_chapters: + style = elem.get('style', '') + style += ';page-break-before: always' + elem.set('style', style) + def collect_font_statistics(self): ''' Collect font statistics to figure out the base font size used in this @@ -46,45 +56,64 @@ class 
HTMLProcessor(Parser): def config(defaults=None): - c = common_config(defaults=defaults) - return c + return common_config(defaults=defaults) def option_parser(): c = config() return c.option_parser(usage=_('''\ -%prog [options] file.html +%prog [options] file.html|opf -Convert a HTML file to an EPUB ebook. Follows links in the HTML file. +Convert a HTML file to an EPUB ebook. Recursively follows links in the HTML file. +If you specify an OPF file instead of an HTML file, the list of links is taken from +the <spine> element of the OPF file. ''')) -def parse_content(filelist, opts): - tdir = PersistentTemporaryDirectory('_html2epub') +def parse_content(filelist, opts, tdir): os.makedirs(os.path.join(tdir, 'content', 'resources')) resource_map = {} for htmlfile in filelist: hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'), resource_map, filelist) + hp.save() + return resource_map, hp.htmlfile_map def convert(htmlfile, opts, notification=None): htmlfile = os.path.abspath(htmlfile) if opts.output is None: opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub' opts.output = os.path.abspath(opts.output) - opf, filelist = get_filelist(htmlfile, opts) - mi = merge_metadata(htmlfile, opf, opts) + if htmlfile.lower().endswith('.opf'): + opf = OPFReader(htmlfile, os.path.dirname(os.path.abspath(htmlfile))) + filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding) + mi = MetaInformation(opf) + else: + opf, filelist = get_filelist(htmlfile, opts) + mi = merge_metadata(htmlfile, opf, opts) opts.chapter = XPath(opts.chapter, namespaces={'re':'http://exslt.org/regular-expressions'}) - resource_map = parse_content(filelist, opts) - - resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()] - - if opf.cover and os.access(opf.cover, os.R_OK): - shutil.copyfile(opf.cover, os.path.join(opts.output, 'content', 
'resources', '_cover_'+os.path.splitext(opf.cover)) - shutil.copyfile(opf.cover, cpath) - resources.append(cpath) - mi.cover = cpath + with TemporaryDirectory('_html2epub') as tdir: + resource_map, htmlfile_map = parse_content(filelist, opts, tdir) + resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()] + + if opf and opf.cover and os.access(opf.cover, os.R_OK): + cover_ext = os.path.splitext(opf.cover)[-1] + cpath = os.path.join(tdir, 'content', 'resources', '_cover_'+cover_ext) + shutil.copyfile(opf.cover, cpath) + resources.append(cpath) + mi.cover = cpath + + spine = [htmlfile_map[f.path] for f in filelist] + mi = create_metadata(tdir, mi, spine, resources) + buf = cStringIO.StringIO() + if mi.toc: + rebase_toc(mi.toc, htmlfile_map, tdir) + with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f: + mi.render(f, buf) + toc = buf.getvalue() + if toc: + with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f: + f.write(toc) def main(args=sys.argv): parser = option_parser() diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index 5f482ab39e..3e3531697b 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -1,10 +1,14 @@ from __future__ import with_statement -import cStringIO __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' -import sys, re, os, shutil, logging, tempfile +''' +Code to recursively parse HTML files and create an open ebook in a specified +directory or zip file. All the action starts in :function:`create_dir`. 
+''' + +import sys, re, os, shutil, logging, tempfile, cStringIO from urlparse import urlparse from urllib import unquote @@ -445,10 +449,10 @@ class Parser(PreProcessor, LoggingInterface): self.raw_css = '\n\n'.join(css) # TODO: Figure out what to do about CSS imports from linked stylesheets -def config(defaults=None): - desc = _('Options to control the traversal of HTML') +def config(defaults=None, config_name='html', + desc=_('Options to control the traversal of HTML')): if defaults is None: - c = Config('html', desc) + c = Config(config_name, desc) else: c = StringConfig(defaults, desc) @@ -482,10 +486,12 @@ def config(defaults=None): def option_parser(): c = config() return c.option_parser(usage=_('''\ -%prog [options] file.html +%prog [options] file.html|opf Follow all links in an HTML file and collect them into the specified directory. Also collects any references resources like images, stylesheets, scripts, etc. +If an OPF file is specified instead, the list of files in its <spine> element +is used. ''')) def search_for_opf(dir): @@ -566,7 +572,8 @@ def create_metadata(basepath, mi, filelist, resources): def rebase_toc(toc, htmlfile_map, basepath, root=True): ''' - Rebase a :class:`calibre.ebooks.metadata.toc.TOC` object. + Rebase a :class:`calibre.ebooks.metadata.toc.TOC` object. Maps all entries + in the TOC to point to their new locations relative to the new OPF file. 
''' def fix_entry(entry): if entry.abspath in htmlfile_map.keys(): @@ -582,15 +589,23 @@ def create_dir(htmlfile, opts): ''' Create a directory that contains the open ebook ''' - opf, filelist = get_filelist(htmlfile, opts) - mi = merge_metadata(htmlfile, opf, opts) + if htmlfile.lower().endswith('.opf'): + opf = OPFReader(open(htmlfile, 'rb'), os.path.dirname(os.path.abspath(htmlfile))) + filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding) + mi = MetaInformation(opf) + else: + opf, filelist = get_filelist(htmlfile, opts) + mi = merge_metadata(htmlfile, opf, opts) + resource_map, htmlfile_map = parse_content(filelist, opts) resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()] + if opf and opf.cover and os.access(opf.cover, os.R_OK): cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[-1]) shutil.copyfile(opf.cover, cpath) resources.append(cpath) mi.cover = cpath + spine = [htmlfile_map[f.path] for f in filelist] mi = create_metadata(opts.output, mi, spine, resources) buf = cStringIO.StringIO() diff --git a/src/calibre/ebooks/metadata/epub.py b/src/calibre/ebooks/metadata/epub.py index d0de9cbdcd..6a1993d9f0 100644 --- a/src/calibre/ebooks/metadata/epub.py +++ b/src/calibre/ebooks/metadata/epub.py @@ -105,7 +105,6 @@ def set_metadata(stream, mi): reader.opf.smart_update(mi) newopf = StringIO(reader.opf.render()) safe_replace(stream, reader.container[OPF.MIMETYPE], newopf) - print newopf.getvalue() def option_parser(): parser = get_parser('epub') diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index 51b5035290..b61ed3fad1 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -150,7 +150,7 @@ class OPF(object): def fset(self, val): matches = self.isbn_path(self.tree) if not matches: - matches = [self.create_metadata_element('dc:identifier', + matches = 
[self.create_metadata_element('identifier', ns='dc', attrib={'{%s}scheme'%self.NAMESPACES['opf']:'ISBN'})] matches[0].text = unicode(val) return property(fget=fget, fset=fset) diff --git a/src/calibre/parallel.py b/src/calibre/parallel.py index 1f4816b25e..2fb206af8b 100644 --- a/src/calibre/parallel.py +++ b/src/calibre/parallel.py @@ -920,8 +920,8 @@ def worker(host, port): msg = 'ERROR:'+cPickle.dumps((exception, tb),-1) write(client_socket, msg) res = read(client_socket, 10) - if res != 'OK': - break + if res != 'OK': + break gc.collect() elif msg == 'PING:': write(client_socket, 'OK')