From 1770f7bf74cf1c9330327b76778206b89ac4e7e1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 19 Apr 2009 14:44:37 -0700 Subject: [PATCH] Ported structure detection code and added plugin for FB2 input. --- src/calibre/customize/builtins.py | 4 +- src/calibre/ebooks/conversion/cli.py | 25 ++- src/calibre/ebooks/conversion/plumber.py | 91 ++++++++++- src/calibre/ebooks/epub/from_any.py | 109 ++----------- src/calibre/ebooks/{lrf => }/fb2/__init__.py | 0 src/calibre/ebooks/{lrf => }/fb2/fb2.xsl | 0 src/calibre/ebooks/fb2/input.py | 74 +++++++++ src/calibre/ebooks/lrf/fb2/convert_from.py | 125 --------------- src/calibre/ebooks/oeb/base.py | 61 ++++++- src/calibre/ebooks/oeb/iterator.py | 8 +- src/calibre/ebooks/oeb/reader.py | 3 +- src/calibre/ebooks/oeb/transforms/split.py | 6 +- .../ebooks/oeb/transforms/structure.py | 151 ++++++++++++++++++ src/calibre/linux.py | 10 -- upload.py | 2 +- 15 files changed, 422 insertions(+), 247 deletions(-) rename src/calibre/ebooks/{lrf => }/fb2/__init__.py (100%) rename src/calibre/ebooks/{lrf => }/fb2/fb2.xsl (100%) create mode 100644 src/calibre/ebooks/fb2/input.py delete mode 100644 src/calibre/ebooks/lrf/fb2/convert_from.py create mode 100644 src/calibre/ebooks/oeb/transforms/structure.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 08824a3591..a56d13fd60 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -281,6 +281,7 @@ from calibre.ebooks.mobi.input import MOBIInput from calibre.ebooks.pdf.input import PDFInput from calibre.ebooks.txt.input import TXTInput from calibre.ebooks.lit.input import LITInput +from calibre.ebooks.fb2.input import FB2Input from calibre.ebooks.html.input import HTMLInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.txt.output import TXTOutput @@ -288,7 +289,8 @@ from calibre.ebooks.pdf.output import PDFOutput from calibre.customize.profiles import input_profiles, output_profiles plugins = 
[HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput, - TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput] + TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, + FB2Input] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index b7336ab30a..6d5401278a 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -119,6 +119,24 @@ def add_pipeline_options(parser, plumber): ] ), + 'STRUCTURE DETECTION' : ( + _('Control auto-detection of document structure.'), + [ + 'dont_split_on_page_breaks', 'chapter', 'chapter_mark', + ] + ), + + 'TABLE OF CONTENTS' : ( + _('Control the automatic generation of a Table of Contents. By ' + 'default, if the source file has a Table of Contents, it will ' + 'be used in preference to the automatically generated one.'), + [ + 'level1_toc', 'level2_toc', 'level3_toc', + 'toc_threshold', 'max_toc_links', 'no_chapters_in_toc', + 'use_auto_toc', + ] + ), + 'METADATA' : (_('Options to set metadata in the output'), plumber.metadata_option_names, ), @@ -130,7 +148,8 @@ def add_pipeline_options(parser, plumber): } - group_order = ['', 'LOOK AND FEEL', 'METADATA', 'DEBUG'] + group_order = ['', 'LOOK AND FEEL', 'STRUCTURE DETECTION', + 'TABLE OF CONTENTS', 'METADATA', 'DEBUG'] for group in group_order: desc, options = groups[group] @@ -163,6 +182,10 @@ def main(args=sys.argv): add_pipeline_options(parser, plumber) opts = parser.parse_args(args)[0] + y = lambda q : os.path.abspath(os.path.expanduser(q)) + for x in ('read_metadata_from_opf', 'cover'): + if getattr(opts, x, None) is not None: + setattr(opts, x, y(getattr(opts, x))) recommendations = [(n.dest, getattr(opts, n.dest), OptionRecommendation.HIGH) \ for n in parser.options_iter() diff --git 
a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 1edeed8d9c..453591e433 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -121,6 +121,88 @@ OptionRecommendation(name='dont_split_on_page_breaks', ) ), +OptionRecommendation(name='level1_toc', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('XPath expression that specifies all tags that ' + 'should be added to the Table of Contents at level one. If ' + 'this is specified, it takes precedence over other forms ' + 'of auto-detection.' + ) + ), + +OptionRecommendation(name='level2_toc', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('XPath expression that specifies all tags that should be ' + 'added to the Table of Contents at level two. Each entry is added ' + 'under the previous level one entry.' + ) + ), + +OptionRecommendation(name='level3_toc', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('XPath expression that specifies all tags that should be ' + 'added to the Table of Contents at level three. Each entry ' + 'is added under the previous level two entry.' + ) + ), + +OptionRecommendation(name='use_auto_toc', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Normally, if the source file already has a Table of ' + 'Contents, it is used in preference to the auto-generated one. ' + 'With this option, the auto-generated one is always used.' + ) + ), + +OptionRecommendation(name='no_chapters_in_toc', + recommended_value=False, level=OptionRecommendation.LOW, + help=_("Don't add auto-detected chapters to the Table of " + 'Contents.' + ) + ), + +OptionRecommendation(name='toc_threshold', + recommended_value=6, level=OptionRecommendation.LOW, + help=_( + 'If fewer than this number of chapters is detected, then links ' + 'are added to the Table of Contents. 
Default: %default') + ), + +OptionRecommendation(name='max_toc_links', + recommended_value=50, level=OptionRecommendation.LOW, + help=_('Maximum number of links to insert into the TOC. Set to 0 ' + 'to disable. Default is: %default. Links are only added to the ' + 'TOC if less than the threshold number of chapters were detected.' + ) + ), + +OptionRecommendation(name='chapter', + recommended_value="//*[((name()='h1' or name()='h2') and " + "re:test(., 'chapter|book|section|part', 'i')) or @class " + "= 'chapter']", level=OptionRecommendation.LOW, + help=_('An XPath expression to detect chapter titles. The default ' + 'is to consider

<h1> or <h2>

tags that contain the words ' + '"chapter","book","section" or "part" as chapter titles as ' + 'well as any tags that have class="chapter". The expression ' + 'used must evaluate to a list of elements. To disable chapter ' + 'detection, use the expression "/". See the XPath Tutorial ' + 'in the calibre User Manual for further help on using this ' + 'feature.' + ) + ), + +OptionRecommendation(name='chapter_mark', + recommended_value='pagebreak', level=OptionRecommendation.LOW, + choices=['pagebreak', 'rule', 'both', 'none'], + help=_('Specify how to mark detected chapters. A value of ' + '"pagebreak" will insert page breaks before chapters. ' + 'A value of "rule" will insert a line before chapters. ' + 'A value of "none" will disable chapter marking and a ' + 'value of "both" will use both page breaks and lines ' + 'to mark chapters.') + ), + + OptionRecommendation(name='read_metadata_from_opf', recommended_value=None, level=OptionRecommendation.LOW, @@ -130,6 +212,7 @@ OptionRecommendation(name='read_metadata_from_opf', 'file.') ), + OptionRecommendation(name='title', recommended_value=None, level=OptionRecommendation.LOW, help=_('Set the title.')), @@ -237,6 +320,7 @@ OptionRecommendation(name='language', rec = self.get_option_by_name(name) if rec is not None and rec.level <= level: rec.recommended_value = val + rec.level = level def merge_ui_recommendations(self, recommendations): ''' @@ -248,6 +332,7 @@ OptionRecommendation(name='language', rec = self.get_option_by_name(name) if rec is not None and rec.level <= level and rec.level < rec.HIGH: rec.recommended_value = val + rec.level = level def read_user_metadata(self): ''' @@ -332,6 +417,9 @@ OptionRecommendation(name='language', self.opts.source = self.opts.input_profile self.opts.dest = self.opts.output_profile + from calibre.ebooks.oeb.transforms.structure import DetectStructure + DetectStructure()(self.oeb, self.opts) + from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener fbase = 
self.opts.base_font_size if fbase == 0: @@ -364,6 +452,8 @@ OptionRecommendation(name='language', trimmer = ManifestTrimmer() trimmer(self.oeb, self.opts) + self.oeb.toc.rationalize_play_orders() + self.log.info('Creating %s...'%self.output_plugin.name) self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts, self.log) @@ -384,4 +474,3 @@ def create_oebbook(log, path_or_stream, opts, reader=None): reader()(oeb, path_or_stream) return oeb - diff --git a/src/calibre/ebooks/epub/from_any.py b/src/calibre/ebooks/epub/from_any.py index b3e5281525..196ed59646 100644 --- a/src/calibre/ebooks/epub/from_any.py +++ b/src/calibre/ebooks/epub/from_any.py @@ -15,88 +15,15 @@ from calibre.ebooks import DRMError from calibre.ebooks.epub import config as common_config from calibre.ebooks.epub.from_html import convert as html2epub, find_html_index from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.metadata import MetaInformation -from calibre.ebooks.metadata.opf2 import OPFCreator, OPF from calibre.utils.zipfile import ZipFile from calibre.customize.ui import run_plugins_on_preprocess -def lit2opf(path, tdir, opts): - from calibre.ebooks.lit.reader import LitReader - print 'Exploding LIT file:', path - reader = LitReader(path) - reader.extract_content(tdir, False) - opf = None - for opf in walk(tdir): - if opf.lower().endswith('.opf'): - break - if not opf.endswith('.opf'): - opf = None - if opf is not None: # Check for url-quoted filenames - _opf = OPF(opf, os.path.dirname(opf)) - replacements = [] - for item in _opf.itermanifest(): - href = item.get('href', '') - path = os.path.join(os.path.dirname(opf), *(href.split('/'))) - if not os.path.exists(path) and os.path.exists(path.replace('&', '%26')): - npath = path - path = path.replace('&', '%26') - replacements.append((path, npath)) - if replacements: - print 'Fixing quoted filenames...' 
- for path, npath in replacements: - if os.path.exists(path): - os.rename(path, npath) - for f in walk(tdir): - with open(f, 'r+b') as f: - raw = f.read() - for path, npath in replacements: - raw = raw.replace(os.path.basename(path), os.path.basename(npath)) - f.seek(0) - f.truncate() - f.write(raw) - return opf -def mobi2opf(path, tdir, opts): - from calibre.ebooks.mobi.reader import MobiReader - print 'Exploding MOBI file:', path.encode('utf-8') if isinstance(path, unicode) else path - reader = MobiReader(path) - reader.extract_content(tdir) - files = list(walk(tdir)) - opts.encoding = 'utf-8' - for f in files: - if f.lower().endswith('.opf'): - return f - html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}', re.IGNORECASE) - hf = [f for f in files if html_pat.match(os.path.splitext(f)[1]) is not None] - mi = MetaInformation(os.path.splitext(os.path.basename(path))[0], [_('Unknown')]) - opf = OPFCreator(tdir, mi) - opf.create_manifest([(hf[0], None)]) - opf.create_spine([hf[0]]) - ans = os.path.join(tdir, 'metadata.opf') - opf.render(open(ans, 'wb')) - return ans - -def fb22opf(path, tdir, opts): - from calibre.ebooks.lrf.fb2.convert_from import to_html - print 'Converting FB2 to HTML...' 
- return to_html(path, tdir) - def rtf2opf(path, tdir, opts): from calibre.ebooks.lrf.rtf.convert_from import generate_html generate_html(path, tdir) return os.path.join(tdir, 'metadata.opf') -def txt2opf(path, tdir, opts): - from calibre.ebooks.lrf.txt.convert_from import generate_html - generate_html(path, opts.encoding, tdir) - return os.path.join(tdir, 'metadata.opf') - -def pdf2opf(path, tdir, opts): - from calibre.ebooks.lrf.pdf.convert_from import generate_html - generate_html(path, tdir) - opts.dont_split_on_page_breaks = True - return os.path.join(tdir, 'metadata.opf') - def epub2opf(path, tdir, opts): zf = ZipFile(path) zf.extractall(tdir) @@ -110,35 +37,23 @@ def epub2opf(path, tdir, opts): if opf and os.path.exists(encfile): if not process_encryption(encfile, opf): raise DRMError(os.path.basename(path)) - + if opf is None: raise ValueError('%s is not a valid EPUB file'%path) return opf - + def odt2epub(path, tdir, opts): from calibre.ebooks.odt.to_oeb import Extract opts.encoding = 'utf-8' return Extract()(path, tdir) -MAP = { - 'lit' : lit2opf, - 'mobi' : mobi2opf, - 'prc' : mobi2opf, - 'azw' : mobi2opf, - 'fb2' : fb22opf, - 'rtf' : rtf2opf, - 'txt' : txt2opf, - 'pdf' : pdf2opf, - 'epub' : epub2opf, - 'odt' : odt2epub, - } -SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf', +SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf', 'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub'] def unarchive(path, tdir): extract(path, tdir) files = list(walk(tdir)) - + for ext in ['opf'] + list(MAP.keys()): for f in files: if f.lower().endswith('.'+ext): @@ -147,32 +62,32 @@ def unarchive(path, tdir): return f, ext return find_html_index(files) -def any2epub(opts, path, notification=None, create_epub=True, +def any2epub(opts, path, notification=None, create_epub=True, oeb_cover=False, extract_to=None): path = run_plugins_on_preprocess(path) ext = os.path.splitext(path)[1] if not ext: raise ValueError('Unknown file type: '+path) 
ext = ext.lower()[1:] - + if opts.output is None: opts.output = os.path.splitext(os.path.basename(path))[0]+'.epub' - + with nested(TemporaryDirectory('_any2epub1'), TemporaryDirectory('_any2epub2')) as (tdir1, tdir2): if ext in ['rar', 'zip', 'oebzip']: path, ext = unarchive(path, tdir1) print 'Found %s file in archive'%(ext.upper()) - + if ext in MAP.keys(): path = MAP[ext](path, tdir2, opts) ext = 'opf' - - + + if re.match(r'((x){0,1}htm(l){0,1})|opf', ext) is None: raise ValueError('Conversion from %s is not supported'%ext.upper()) - + print 'Creating EPUB file...' - html2epub(path, opts, notification=notification, + html2epub(path, opts, notification=notification, create_epub=create_epub, oeb_cover=oeb_cover, extract_to=extract_to) diff --git a/src/calibre/ebooks/lrf/fb2/__init__.py b/src/calibre/ebooks/fb2/__init__.py similarity index 100% rename from src/calibre/ebooks/lrf/fb2/__init__.py rename to src/calibre/ebooks/fb2/__init__.py diff --git a/src/calibre/ebooks/lrf/fb2/fb2.xsl b/src/calibre/ebooks/fb2/fb2.xsl similarity index 100% rename from src/calibre/ebooks/lrf/fb2/fb2.xsl rename to src/calibre/ebooks/fb2/fb2.xsl diff --git a/src/calibre/ebooks/fb2/input.py b/src/calibre/ebooks/fb2/input.py new file mode 100644 index 0000000000..d96758a4bd --- /dev/null +++ b/src/calibre/ebooks/fb2/input.py @@ -0,0 +1,74 @@ +from __future__ import with_statement +__license__ = 'GPL v3' +__copyright__ = '2008, Anatoly Shipitsin ' +""" +Convert .fb2 files to .lrf +""" +import os +from base64 import b64decode +from lxml import etree + +from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre import guess_type + +FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0' + +class FB2Input(InputFormatPlugin): + + name = 'FB2 Input' + author = 'Anatoly Shipitsin' + description = 'Convert FB2 files to HTML' + file_types = set(['fb2']) + + recommendations = set([ + ('level1_toc', '//h:h1', OptionRecommendation.MED), + ('level2_toc', '//h:h2', 
OptionRecommendation.MED), + ('level3_toc', '//h:h3', OptionRecommendation.MED), + ]) + + def convert(self, stream, options, file_ext, log, + accelerators): + from calibre.resources import fb2_xsl + from calibre.ebooks.metadata.opf2 import OPFCreator + from calibre.ebooks.metadata.meta import get_metadata + from calibre.ebooks.oeb.base import XLINK_NS + NAMESPACES = {'f':FB2NS, 'l':XLINK_NS} + + log.debug('Parsing XML...') + parser = etree.XMLParser(recover=True, no_network=True) + doc = etree.parse(stream, parser) + self.extract_embedded_content(doc) + log.debug('Converting XML to HTML...') + styledoc = etree.fromstring(fb2_xsl) + + transform = etree.XSLT(styledoc) + result = transform(doc) + open('index.xhtml', 'wb').write(transform.tostring(result)) + stream.seek(0) + mi = get_metadata(stream, 'fb2') + if not mi.title: + mi.title = _('Unknown') + if not mi.authors: + mi.authors = [_('Unknown')] + opf = OPFCreator(os.getcwdu(), mi) + entries = [(f, guess_type(f)[0]) for f in os.listdir('.')] + opf.create_manifest(entries) + opf.create_spine(['index.xhtml']) + + for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES): + href = img.get('{%s}href'%XLINK_NS, img.get('href', None)) + if href is not None: + if href.startswith('#'): + href = href[1:] + opf.guide.set_cover(os.path.abspath(href)) + + opf.render(open('metadata.opf', 'wb')) + return os.path.join(os.getcwd(), 'metadata.opf') + + def extract_embedded_content(self, doc): + for elem in doc.xpath('./*'): + if 'binary' in elem.tag and elem.attrib.has_key('id'): + fname = elem.attrib['id'] + data = b64decode(elem.text.strip()) + open(fname, 'wb').write(data) + diff --git a/src/calibre/ebooks/lrf/fb2/convert_from.py b/src/calibre/ebooks/lrf/fb2/convert_from.py deleted file mode 100644 index 24562e708c..0000000000 --- a/src/calibre/ebooks/lrf/fb2/convert_from.py +++ /dev/null @@ -1,125 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Anatoly Shipitsin ' -""" 
-Convert .fb2 files to .lrf -""" -import os, sys, shutil, logging -from base64 import b64decode -from lxml import etree - -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks.metadata.meta import get_metadata -from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file -from calibre import setup_cli_handlers -from calibre.resources import fb2_xsl -from calibre.ptempfile import PersistentTemporaryDirectory -from calibre.ebooks.metadata.opf import OPFCreator -from calibre.ebooks.metadata import MetaInformation - - -def option_parser(): - parser = lrf_option_parser( -_('''%prog [options] mybook.fb2 - - -%prog converts mybook.fb2 to mybook.lrf''')) - parser.add_option('--debug-html-generation', action='store_true', default=False, - dest='debug_html_generation', help=_('Print generated HTML to stdout and quit.')) - parser.add_option('--keep-intermediate-files', action='store_true', default=False, - help=_('Keep generated HTML files after completing conversion to LRF.')) - return parser - -def extract_embedded_content(doc): - for elem in doc.xpath('./*'): - if 'binary' in elem.tag and elem.attrib.has_key('id'): - fname = elem.attrib['id'] - data = b64decode(elem.text.strip()) - open(fname, 'wb').write(data) - -def to_html(fb2file, tdir): - fb2file = os.path.abspath(fb2file) - cwd = os.getcwd() - try: - os.chdir(tdir) - print 'Parsing XML...' - parser = etree.XMLParser(recover=True, no_network=True) - doc = etree.parse(fb2file, parser) - extract_embedded_content(doc) - print 'Converting XML to HTML...' 
- styledoc = etree.fromstring(fb2_xsl) - - transform = etree.XSLT(styledoc) - result = transform(doc) - open('index.html', 'wb').write(transform.tostring(result)) - try: - mi = get_metadata(open(fb2file, 'rb'), 'fb2') - except: - mi = MetaInformation(None, None) - if not mi.title: - mi.title = os.path.splitext(os.path.basename(fb2file))[0] - if not mi.authors: - mi.authors = [_('Unknown')] - opf = OPFCreator(tdir, mi) - opf.create_manifest([('index.html', None)]) - opf.create_spine(['index.html']) - opf.render(open('metadata.opf', 'wb')) - return os.path.join(tdir, 'metadata.opf') - finally: - os.chdir(cwd) - - -def generate_html(fb2file, encoding, logger): - tdir = PersistentTemporaryDirectory('_fb22lrf') - to_html(fb2file, tdir) - return os.path.join(tdir, 'index.html') - -def process_file(path, options, logger=None): - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('fb22lrf') - setup_cli_handlers(logger, level) - fb2 = os.path.abspath(os.path.expanduser(path)) - f = open(fb2, 'rb') - mi = get_metadata(f, 'fb2') - f.close() - htmlfile = generate_html(fb2, options.encoding, logger) - tdir = os.path.dirname(htmlfile) - cwd = os.getcwdu() - try: - if not options.output: - ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) - options.output = os.path.abspath(os.path.expanduser(options.output)) - if not mi.title: - mi.title = os.path.splitext(os.path.basename(fb2))[0] - if (not options.title or options.title == _('Unknown')): - options.title = mi.title - if (not options.author or options.author == _('Unknown')) and mi.authors: - options.author = mi.authors.pop() - if (not options.category or options.category == _('Unknown')) and mi.category: - options.category = mi.category - if (not options.freetext or options.freetext == _('Unknown')) and mi.comments: - options.freetext = mi.comments - os.chdir(tdir) - html_process_file(htmlfile, 
options, logger) - finally: - os.chdir(cwd) - if getattr(options, 'keep_intermediate_files', False): - logger.debug('Intermediate files in '+ tdir) - else: - shutil.rmtree(tdir) - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - print - print 'No fb2 file specified' - return 1 - process_file(args[1], options, logger) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index dda36a7500..85510e2127 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -41,10 +41,12 @@ NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/' SVG_NS = 'http://www.w3.org/2000/svg' XLINK_NS = 'http://www.w3.org/1999/xlink' CALIBRE_NS = 'http://calibre.kovidgoyal.net/2009/metadata' +RE_NS = 'http://exslt.org/regular-expressions' + XPNSMAP = {'h' : XHTML_NS, 'o1' : OPF1_NS, 'o2' : OPF2_NS, 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, 'xsi': XSI_NS, 'dt' : DCTERMS_NS, 'ncx': NCX_NS, - 'svg': SVG_NS, 'xl' : XLINK_NS} + 'svg': SVG_NS, 'xl' : XLINK_NS, 're': RE_NS} OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS} OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS, 'xsi': XSI_NS, 'calibre': CALIBRE_NS} @@ -1256,16 +1258,21 @@ class TOC(object): :attr:`klass`: Optional semantic class referenced by this node. :attr:`id`: Option unique identifier for this node. 
""" - def __init__(self, title=None, href=None, klass=None, id=None): + def __init__(self, title=None, href=None, klass=None, id=None, + play_order=None): self.title = title self.href = urlnormalize(href) if href else href self.klass = klass self.id = id self.nodes = [] + self.play_order = 0 + if play_order is None: + play_order = self.next_play_order() + self.play_order = play_order - def add(self, title, href, klass=None, id=None): + def add(self, title, href, klass=None, id=None, play_order=0): """Create and return a new sub-node of this node.""" - node = TOC(title, href, klass, id) + node = TOC(title, href, klass, id, play_order) self.nodes.append(node) return node @@ -1276,6 +1283,18 @@ class TOC(object): for node in child.iter(): yield node + def count(self): + return len(list(self.iter())) - 1 + + def next_play_order(self): + return max([x.play_order for x in self.iter()])+1 + + def has_href(self, href): + for x in self.iter(): + if x.href == href: + return True + return False + def iterdescendants(self): """Iterate over all descendant nodes in depth-first order.""" for child in self.nodes: @@ -1309,6 +1328,10 @@ class TOC(object): except ValueError: return 1 + def __str__(self): + return 'TOC: %s --> %s'%(self.title, self.href) + + def to_opf1(self, tour): for node in self.nodes: element(tour, 'site', attrib={ @@ -1319,7 +1342,7 @@ class TOC(object): def to_ncx(self, parent): for node in self.nodes: id = node.id or unicode(uuid.uuid4()) - attrib = {'id': id, 'playOrder': '0'} + attrib = {'id': id, 'playOrder': str(node.play_order)} if node.klass: attrib['class'] = node.klass point = element(parent, NCX('navPoint'), attrib=attrib) @@ -1329,6 +1352,34 @@ class TOC(object): node.to_ncx(point) return parent + def rationalize_play_orders(self): + ''' + Ensure that all nodes with the same play_order have the same href and + with different play_orders have different hrefs. 
+ ''' + def po_node(n): + for x in self.iter(): + if x is n: + return + if x.play_order == n.play_order: + return x + + def href_node(n): + for x in self.iter(): + if x is n: + return + if x.href == n.href: + return x + + for x in self.iter(): + y = po_node(x) + if y is not None: + if x.href != y.href: + x.play_order = getattr(href_node(x), 'play_order', + self.next_play_order()) + y = href_node(x) + if y is not None: + x.play_order = y.play_order class PageList(object): """Collection of named "pages" to mapped positions within an OEB data model diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index 81e1f89029..ab3e90083d 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -118,6 +118,7 @@ class EbookIterator(object): print 'Loaded embedded font:', repr(family) def __enter__(self): + self.delete_on_exit = [] self._tdir = TemporaryDirectory('_ebook_iter') self.base = self._tdir.__enter__() from calibre.ebooks.conversion.plumber import Plumber @@ -137,9 +138,11 @@ class EbookIterator(object): cover = self.opf.cover if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf') and cover: - cfile = os.path.join(os.path.dirname(self.spine[0]), 'calibre_ei_cover.html') + cfile = os.path.join(os.path.dirname(self.spine[0]), + 'calibre_iterator_cover.html') open(cfile, 'wb').write(TITLEPAGE%cover) self.spine[0:0] = [SpineItem(cfile)] + self.delete_on_exit.append(cfile) if self.opf.path_to_html_toc is not None and \ self.opf.path_to_html_toc not in self.spine: @@ -221,3 +224,6 @@ class EbookIterator(object): def __exit__(self, *args): self._tdir.__exit__(*args) + for x in self.delete_on_exit: + if os.path.exists(x): + os.remove(x) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index 6f0ff44bc9..02b3b92b01 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -343,7 +343,8 @@ class OEBReader(object): continue id = child.get('id') klass = 
child.get('class') - node = toc.add(title, href, id=id, klass=klass) + po = int(child.get('playOrder', self.oeb.toc.next_play_order())) + node = toc.add(title, href, id=id, klass=klass, play_order=po) self._toc_from_navpoint(item, node, child) def _toc_from_ncx(self, item): diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index 33ab14b73d..bc7e4e195d 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -15,12 +15,10 @@ from lxml.etree import XPath as _XPath from lxml import etree from lxml.cssselect import CSSSelector -from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP, urldefrag, \ - rewrite_links +from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \ + urldefrag, rewrite_links from calibre.ebooks.epub import tostring, rules -NAMESPACES = dict(XPNSMAP) -NAMESPACES['re'] = 'http://exslt.org/regular-expressions' XPath = functools.partial(_XPath, namespaces=NAMESPACES) diff --git a/src/calibre/ebooks/oeb/transforms/structure.py b/src/calibre/ebooks/oeb/transforms/structure.py new file mode 100644 index 0000000000..0f1502ef03 --- /dev/null +++ b/src/calibre/ebooks/oeb/transforms/structure.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from lxml import etree +from urlparse import urlparse + +from calibre.ebooks.oeb.base import XPNSMAP, TOC +XPath = lambda x: etree.XPath(x, namespaces=XPNSMAP) + +class DetectStructure(object): + + def __call__(self, oeb, opts): + self.log = oeb.log + self.oeb = oeb + self.opts = opts + self.log('Detecting structure...') + + self.detect_chapters() + if self.oeb.auto_generated_toc or opts.use_auto_toc: + orig_toc = self.oeb.toc + self.oeb.toc = TOC() + self.create_level_based_toc() + if self.oeb.toc.count() < 1: + if 
not opts.no_chapters_in_toc and self.detected_chapters: + self.create_toc_from_chapters() + if self.oeb.toc.count() < opts.toc_threshold: + self.create_toc_from_links() + if self.oeb.toc.count() < 2 and orig_toc.count() > 2: + self.oeb.toc = orig_toc + else: + self.oeb.auto_generated_toc = True + self.log('Auto generated TOC with %d entries.' % + self.oeb.toc.count()) + + + def detect_chapters(self): + self.detected_chapters = [] + if self.opts.chapter: + chapter_xpath = XPath(self.opts.chapter) + for item in self.oeb.spine: + for x in chapter_xpath(item.data): + self.detected_chapters.append((item, x)) + + chapter_mark = self.opts.chapter_mark + page_break_before = 'display: block; page-break-before: always' + page_break_after = 'display: block; page-break-after: always' + for item, elem in self.detected_chapters: + text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) + self.log('\tDetected chapter:', text[:50]) + if chapter_mark == 'none': + continue + elif chapter_mark == 'rule': + mark = etree.Element('hr') + elif chapter_mark == 'pagebreak': + mark = etree.Element('div', style=page_break_after) + else: # chapter_mark == 'both': + mark = etree.Element('hr', style=page_break_before) + elem.addprevious(mark) + + def create_level_based_toc(self): + if self.opts.level1_toc is None: + return + for item in self.oeb.spine: + self.add_leveled_toc_items(item) + + def create_toc_from_chapters(self): + counter = self.oeb.toc.next_play_order() + for item, elem in self.detected_chapters: + text, href = self.elem_to_link(item, elem, counter) + self.oeb.toc.add(text, href, play_order=counter) + counter += 1 + + def create_toc_from_links(self): + for item in self.oeb.spine: + for a in item.data.xpath('//h:a[@href]'): + href = a.get('href') + purl = urlparse(href) + if not purl[0] or purl[0] == 'file': + href, frag = purl.path, purl.fragment + href = item.abshref(href) + if frag: + href = '#'.join((href, frag)) + if not self.oeb.toc.has_href(href): + text = 
u' '.join([t.strip() for t in \ + a.xpath('descendant::text()')]) + text = text[:100].strip() + if not self.oeb.toc.has_text(text): + self.oeb.toc.add(text, href, + play_order=self.oeb.toc.next_play_order()) + + + def elem_to_link(self, item, elem, counter): + text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) + text = text[:100].strip() + id = elem.get('id', 'calibre_toc_%d'%counter) + elem.set('id', id) + href = '#'.join((item.href, id)) + return text, href + + + def add_leveled_toc_items(self, item): + level1 = XPath(self.opts.level1_toc)(item.data) + level1_order = [] + + counter = 1 + if level1: + added = {} + for elem in level1: + text, _href = self.elem_to_link(item, elem, counter) + counter += 1 + if text: + node = self.oeb.toc.add(text, _href, + play_order=self.oeb.toc.next_play_order()) + level1_order.append(node) + added[elem] = node + #node.add(_('Top'), _href) + if self.opts.level2_toc is not None: + added2 = {} + level2 = list(XPath(self.opts.level2_toc)(item.data)) + for elem in level2: + level1 = None + for item in item.data.iterdescendants(): + if item in added.keys(): + level1 = added[item] + elif item == elem and level1 is not None: + text, _href = self.elem_to_link(item, elem, counter) + counter += 1 + if text: + added2[elem] = level1.add(text, _href, + play_order=self.oeb.toc.next_play_order()) + if self.opts.level3_toc is not None: + level3 = list(XPath(self.opts.level3_toc)(item.data)) + for elem in level3: + level2 = None + for item in item.data.iterdescendants(): + if item in added2.keys(): + level2 = added2[item] + elif item == elem and level2 is not None: + text, _href = \ + self.elem_to_link(item, elem, counter) + counter += 1 + if text: + level2.add(text, _href, + play_order=self.oeb.toc.next_play_order()) + + diff --git a/src/calibre/linux.py b/src/calibre/linux.py index ee51370b61..2d13ea2730 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -27,10 +27,6 @@ entry_points = { 'lrs2lrf = 
calibre.ebooks.lrf.lrs.convert_from:main', 'isbndb = calibre.ebooks.metadata.isbndb:main', 'librarything = calibre.ebooks.metadata.library_thing:main', - 'comic2lrf = calibre.ebooks.lrf.comic.convert_from:main', - 'comic2epub = calibre.ebooks.epub.from_comic:main', - 'comic2mobi = calibre.ebooks.mobi.from_comic:main', - 'comic2pdf = calibre.ebooks.pdf.from_comic:main', 'calibre-debug = calibre.debug:main', 'calibredb = calibre.library.cli:main', 'calibre-fontconfig = calibre.utils.fontconfig:main', @@ -151,8 +147,6 @@ def setup_completion(fatal_errors): from calibre.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop from calibre.web.feeds.main import option_parser as feeds2disk from calibre.web.feeds.recipes import titles as feed_titles - from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop - from calibre.ebooks.epub.from_comic import option_parser as comic2epub from calibre.ebooks.metadata.fetch import option_parser as fem_op from calibre.gui2.main import option_parser as guiop from calibre.utils.smtp import option_parser as smtp_op @@ -181,10 +175,6 @@ def setup_completion(fatal_errors): f.write(opts_and_exts('ebook-meta', metaop, list(meta_filetypes()))) f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf'])) f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf'])) - f.write(opts_and_exts('comic2lrf', comicop, ['cbz', 'cbr'])) - f.write(opts_and_exts('comic2epub', comic2epub, ['cbz', 'cbr'])) - f.write(opts_and_exts('comic2mobi', comic2epub, ['cbz', 'cbr'])) - f.write(opts_and_exts('comic2pdf', comic2epub, ['cbz', 'cbr'])) f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles)) f.write(opts_and_words('fetch-ebook-metadata', fem_op, [])) f.write(opts_and_words('calibre-smtp', smtp_op, [])) diff --git a/upload.py b/upload.py index 6bc90aada2..a29e5b097c 100644 --- a/upload.py +++ b/upload.py @@ -139,7 +139,7 @@ class resources(OptionlessCommand): RESOURCES = dict( opf_template = 'ebooks/metadata/opf.xml', ncx_template = 
'ebooks/metadata/ncx.xml', - fb2_xsl = 'ebooks/lrf/fb2/fb2.xsl', + fb2_xsl = 'ebooks/fb2/fb2.xsl', metadata_sqlite = 'library/metadata_sqlite.sql', jquery = 'gui2/viewer/jquery.js', jquery_scrollTo = 'gui2/viewer/jquery_scrollTo.js',