IGN:Fix more minor regressions

2025-07-09 03:04:10 -04:00 · 2008-09-12 21:04:24 -07:00 · 2008-09-12 21:04:24 -07:00 · 829a344fe9
commit 829a344fe9
parent f7bf112ae2
6 changed files with 95 additions and 40 deletions
--- a/src/calibre/ebooks/epub/init.py
+++ b/src/calibre/ebooks/epub/init.py
@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
 '''
 Conversion to EPUB.
 '''
-import sys
+import sys, textwrap
 from calibre.utils.config import Config, StringConfig
 from calibre.utils.zipfile import ZipFile, ZIP_DEFLATED
 from calibre.ebooks.html import config as common_config
@ -53,9 +53,21 @@ The expression used must evaluate to a list of elements. To disable chapter dete
 use the expression "/". See the XPath Tutorial in the calibre User Manual for further
 help on using this feature.
 ''').replace('\n', ' '))
-    structure('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False,
-              help=_('Don\'t add detected chapters to the Table of Contents'))
-    structure('no_links_in_toc', ['--no-links-in-toc'], default=False,
-              help=_('Don\'t add links in the root HTML file to the Table of Contents'))
+    
+    toc = c.add_group('toc', 
+        _('''\
+Control the automatic generation of a Table of Contents. If an OPF file is detected
+and it specifies a Table of Contents, then that will be used rather than trying
+to auto-generate a Table of Contents.
+''').replace('\n', ' '))
+    toc('max_toc_recursion', ['--max-toc-recursion'], default=1, 
+        help=_('Number of levels of HTML files to try to autodetect TOC entries from. Set to 0 to disable all TOC autodetection. Default is %default.'))
+    toc('max_toc_links', ['--max-toc-links'], default=40, 
+        help=_('Maximum number of links from each HTML file to insert into the TOC. Set to 0 to disable. Default is: %default.'))
+    toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False,
+        help=_("Don't add auto-detected chapters to the Table of Contents."))
+    toc('add_files_to_toc', ['--add-files-to-toc'], default=False,
+        help=_('If more than one HTML file is found, create a TOC entry for each file.'))
+    
    
    return c
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@ -1,13 +1,16 @@
 from __future__ import with_statement
+from calibre.ebooks.metadata.opf import OPFReader
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
-import os, sys, re, shutil
+import os, sys, re, shutil, cStringIO
 from lxml.etree import XPath

-from calibre.ebooks.html import Parser, get_text, merge_metadata, get_filelist
+from calibre.ebooks.html import Parser, get_text, merge_metadata, get_filelist,\
+    opf_traverse, create_metadata, rebase_toc
 from calibre.ebooks.epub import config as common_config
-from calibre.ptempfile import PersistentTemporaryDirectory
+from calibre.ptempfile import TemporaryDirectory
+from calibre.ebooks.metadata import MetaInformation


 class HTMLProcessor(Parser):
@ -17,7 +20,7 @@ class HTMLProcessor(Parser):
                        name='html2epub')
        if opts.verbose > 2:
            self.debug_tree('parsed')
-        self.detected_chapters = self.opts.chapter(self.root)
+        self.detect_chapters()
        self.extract_css()
        
        if opts.verbose > 2:
@ -27,6 +30,13 @@ class HTMLProcessor(Parser):
        
        self.split()
        
+    def detect_chapters(self):  # find chapter elements and force a page break before each one
+        self.detected_chapters = self.opts.chapter(self.root)  # opts.chapter is the compiled chapter XPath
+        for elem in self.detected_chapters:
+            style = elem.get('style', '')  # keep any inline style the element already has
+            style += ';page-break-before: always'
+            elem.set('style', style)  # attribute name is the literal 'style', not the style value
+        
    def collect_font_statistics(self):
        '''
        Collect font statistics to figure out the base font size used in this
@ -46,37 +56,44 @@ class HTMLProcessor(Parser):
            

 def config(defaults=None):
-    c = common_config(defaults=defaults)
-    return c
+    return common_config(defaults=defaults)

 def option_parser():
    c = config()
    return c.option_parser(usage=_('''\
-%prog [options] file.html
+%prog [options] file.html|opf

-Convert a HTML file to an EPUB ebook. Follows links in the HTML file. 
+Convert a HTML file to an EPUB ebook. Recursively follows links in the HTML file.
+If you specify an OPF file instead of an HTML file, the list of links is taken from
+the <spine> element of the OPF file.  
 '''))

-def parse_content(filelist, opts):
-    tdir = PersistentTemporaryDirectory('_html2epub')
+def parse_content(filelist, opts, tdir):
    os.makedirs(os.path.join(tdir, 'content', 'resources'))
    resource_map = {}
    for htmlfile in filelist:
        hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'), 
                           resource_map, filelist)
+        hp.save()
+    return resource_map, hp.htmlfile_map

 def convert(htmlfile, opts, notification=None):
    htmlfile = os.path.abspath(htmlfile)
    if opts.output is None:
        opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub'
    opts.output = os.path.abspath(opts.output)
+    if htmlfile.lower().endswith('.opf'):
+        opf = OPFReader(htmlfile, os.path.dirname(os.path.abspath(htmlfile)))
+        filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
+        mi = MetaInformation(opf)
+    else:
        opf, filelist = get_filelist(htmlfile, opts)
        mi = merge_metadata(htmlfile, opf, opts)
    opts.chapter = XPath(opts.chapter, 
                    namespaces={'re':'http://exslt.org/regular-expressions'})
    
-    resource_map = parse_content(filelist, opts)
-    
+    with TemporaryDirectory('_html2epub') as tdir:
+        resource_map, htmlfile_map = parse_content(filelist, opts, tdir)
        resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
        
        if opf.cover and os.access(opf.cover, os.R_OK):
@ -86,6 +103,18 @@ def convert(htmlfile, opts, notification=None):
            resources.append(cpath)
            mi.cover = cpath
            
+        spine = [htmlfile_map[f.path] for f in filelist]
+        mi = create_metadata(tdir, mi, spine, resources)
+        buf = cStringIO.StringIO()
+        if mi.toc:
+            rebase_toc(mi.toc, htmlfile_map, opts.output)
+        with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f:
+            mi.render(f, buf)
+        toc = buf.getvalue()
+        if toc:
+            with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f:
+                f.write(toc)
+            
 def main(args=sys.argv):
    parser = option_parser()
    opts, args = parser.parse_args(args)
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@ -1,10 +1,14 @@
 from __future__ import with_statement
-import cStringIO
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'

-import sys, re, os, shutil, logging, tempfile
+'''
+Code to recursively parse HTML files and create an open ebook in a specified
+directory or zip file. All the action starts in :func:`create_dir`.
+'''
+
+import sys, re, os, shutil, logging, tempfile, cStringIO
 from urlparse import urlparse
 from urllib import unquote

@ -445,10 +449,10 @@ class Parser(PreProcessor, LoggingInterface):
        self.raw_css = '\n\n'.join(css)
        # TODO: Figure out what to do about CSS imports from linked stylesheets    

-def config(defaults=None):
-    desc = _('Options to control the traversal of HTML')
+def config(defaults=None, config_name='html', 
+           desc=_('Options to control the traversal of HTML')):
    if defaults is None:
-        c = Config('html', desc)
+        c = Config(config_name, desc)
    else:
        c = StringConfig(defaults, desc)
        
@ -482,10 +486,12 @@ def config(defaults=None):
 def option_parser():
    c = config()
    return c.option_parser(usage=_('''\
-%prog [options] file.html
+%prog [options] file.html|opf

 Follow all links in an HTML file and collect them into the specified directory.
 Also collects any references resources like images, stylesheets, scripts, etc. 
+If an OPF file is specified instead, the list of files in its <spine> element
+is used.
 '''))

 def search_for_opf(dir):
@ -566,7 +572,8 @@ def create_metadata(basepath, mi, filelist, resources):

 def rebase_toc(toc, htmlfile_map, basepath, root=True):
    '''
-    Rebase a :class:`calibre.ebooks.metadata.toc.TOC` object.
+    Rebase a :class:`calibre.ebooks.metadata.toc.TOC` object. Maps all entries
+    in the TOC to point to their new locations relative to the new OPF file.
    '''
    def fix_entry(entry):
        if entry.abspath in htmlfile_map.keys():
@ -582,15 +589,23 @@ def create_dir(htmlfile, opts):
    '''
    Create a directory that contains the open ebook
    '''
+    if htmlfile.lower().endswith('.opf'):
+        opf = OPFReader(open(htmlfile, 'rb'), os.path.dirname(os.path.abspath(htmlfile)))
+        filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
+        mi = MetaInformation(opf)
+    else:
        opf, filelist = get_filelist(htmlfile, opts)
        mi = merge_metadata(htmlfile, opf, opts)
+    
    resource_map, htmlfile_map = parse_content(filelist, opts)
    resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
+    
    if opf and opf.cover and os.access(opf.cover, os.R_OK):
        cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[-1])
        shutil.copyfile(opf.cover, cpath)
        resources.append(cpath)
        mi.cover = cpath
+    
    spine = [htmlfile_map[f.path] for f in filelist]
    mi = create_metadata(opts.output, mi, spine, resources)
    buf = cStringIO.StringIO()
--- a/src/calibre/ebooks/metadata/epub.py
+++ b/src/calibre/ebooks/metadata/epub.py
@ -105,7 +105,6 @@ def set_metadata(stream, mi):
    reader.opf.smart_update(mi)
    newopf = StringIO(reader.opf.render())
    safe_replace(stream, reader.container[OPF.MIMETYPE], newopf)
-    print newopf.getvalue()
    
 def option_parser():
    parser = get_parser('epub')
--- a/src/calibre/ebooks/metadata/opf2.py
+++ b/src/calibre/ebooks/metadata/opf2.py
@ -150,7 +150,7 @@ class OPF(object):
        def fset(self, val):
            matches = self.isbn_path(self.tree)
            if not matches:
-                matches = [self.create_metadata_element('dc:identifier', 
+                matches = [self.create_metadata_element('identifier', ns='dc',
                                                attrib={'{%s}scheme'%self.NAMESPACES['opf']:'ISBN'})]
            matches[0].text = unicode(val)
        return property(fget=fget, fset=fset)