IGN:Fix more minor regressions

This commit is contained in:
Kovid Goyal 2008-09-12 21:04:24 -07:00
parent f7bf112ae2
commit 829a344fe9
6 changed files with 95 additions and 40 deletions

View File

@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
''' '''
Conversion to EPUB. Conversion to EPUB.
''' '''
import sys import sys, textwrap
from calibre.utils.config import Config, StringConfig from calibre.utils.config import Config, StringConfig
from calibre.utils.zipfile import ZipFile, ZIP_DEFLATED from calibre.utils.zipfile import ZipFile, ZIP_DEFLATED
from calibre.ebooks.html import config as common_config from calibre.ebooks.html import config as common_config
@ -53,9 +53,21 @@ The expression used must evaluate to a list of elements. To disable chapter dete
use the expression "/". See the XPath Tutorial in the calibre User Manual for further use the expression "/". See the XPath Tutorial in the calibre User Manual for further
help on using this feature. help on using this feature.
''').replace('\n', ' ')) ''').replace('\n', ' '))
structure('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False,
help=_('Don\'t add detected chapters to the Table of Contents')) toc = c.add_group('toc',
structure('no_links_in_toc', ['--no-links-in-toc'], default=False, _('''\
help=_('Don\'t add links in the root HTML file to the Table of Contents')) Control the automatic generation of a Table of Contents. If an OPF file is detected
and it specifies a Table of Contents, then that will be used rather than trying
to auto-generate a Table of Contents.
''').replace('\n', ' '))
toc('max_toc_recursion', ['--max-toc-recursion'], default=1,
help=_('Number of levels of HTML files to try to autodetect TOC entries from. Set to 0 to disable all TOC autodetection. Default is %default.'))
toc('max_toc_links', ['--max-toc-links'], default=40,
help=_('Maximum number of links from each HTML file to insert into the TOC. Set to 0 to disable. Default is: %default.'))
toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False,
help=_("Don't add auto-detected chapters to the Table of Contents."))
toc('add_files_to_toc', ['--add-files-to-toc'], default=False,
help=_('If more than one HTML file is found, create a TOC entry for each file.'))
return c return c

View File

@ -1,13 +1,16 @@
from __future__ import with_statement from __future__ import with_statement
from calibre.ebooks.metadata.opf import OPFReader
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, sys, re, shutil import os, sys, re, shutil, cStringIO
from lxml.etree import XPath from lxml.etree import XPath
from calibre.ebooks.html import Parser, get_text, merge_metadata, get_filelist from calibre.ebooks.html import Parser, get_text, merge_metadata, get_filelist,\
opf_traverse, create_metadata, rebase_toc
from calibre.ebooks.epub import config as common_config from calibre.ebooks.epub import config as common_config
from calibre.ptempfile import PersistentTemporaryDirectory from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.metadata import MetaInformation
class HTMLProcessor(Parser): class HTMLProcessor(Parser):
@ -17,7 +20,7 @@ class HTMLProcessor(Parser):
name='html2epub') name='html2epub')
if opts.verbose > 2: if opts.verbose > 2:
self.debug_tree('parsed') self.debug_tree('parsed')
self.detected_chapters = self.opts.chapter(self.root) self.detect_chapters()
self.extract_css() self.extract_css()
if opts.verbose > 2: if opts.verbose > 2:
@ -27,6 +30,13 @@ class HTMLProcessor(Parser):
self.split() self.split()
def detect_chapters(self):
    '''
    Locate chapter start elements and force a page break before each one.

    Evaluates the ``opts.chapter`` expression against ``self.root``, stores
    the result in ``self.detected_chapters`` and appends
    ``page-break-before: always`` to the inline ``style`` attribute of every
    detected element, preserving any style that is already present.
    '''
    self.detected_chapters = self.opts.chapter(self.root)
    for elem in self.detected_chapters:
        style = elem.get('style', '').strip()
        # Separate from an existing declaration, but avoid a leading or
        # doubled semicolon.
        if style and not style.endswith(';'):
            style += ';'
        # The attribute *name* must be the literal 'style'; using the style
        # value as the name never updated the real style attribute.
        elem.set('style', style + 'page-break-before: always')
def collect_font_statistics(self): def collect_font_statistics(self):
''' '''
Collect font statistics to figure out the base font size used in this Collect font statistics to figure out the base font size used in this
@ -46,45 +56,64 @@ class HTMLProcessor(Parser):
def config(defaults=None): def config(defaults=None):
c = common_config(defaults=defaults) return common_config(defaults=defaults)
return c
def option_parser(): def option_parser():
c = config() c = config()
return c.option_parser(usage=_('''\ return c.option_parser(usage=_('''\
%prog [options] file.html %prog [options] file.html|opf
Convert a HTML file to an EPUB ebook. Follows links in the HTML file. Convert a HTML file to an EPUB ebook. Recursively follows links in the HTML file.
If you specify an OPF file instead of an HTML file, the list of links is taken from
the <spine> element of the OPF file.
''')) '''))
def parse_content(filelist, opts): def parse_content(filelist, opts, tdir):
tdir = PersistentTemporaryDirectory('_html2epub')
os.makedirs(os.path.join(tdir, 'content', 'resources')) os.makedirs(os.path.join(tdir, 'content', 'resources'))
resource_map = {} resource_map = {}
for htmlfile in filelist: for htmlfile in filelist:
hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'), hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'),
resource_map, filelist) resource_map, filelist)
hp.save()
return resource_map, hp.htmlfile_map
def convert(htmlfile, opts, notification=None): def convert(htmlfile, opts, notification=None):
htmlfile = os.path.abspath(htmlfile) htmlfile = os.path.abspath(htmlfile)
if opts.output is None: if opts.output is None:
opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub' opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub'
opts.output = os.path.abspath(opts.output) opts.output = os.path.abspath(opts.output)
opf, filelist = get_filelist(htmlfile, opts) if htmlfile.lower().endswith('.opf'):
mi = merge_metadata(htmlfile, opf, opts) opf = OPFReader(htmlfile, os.path.dirname(os.path.abspath(htmlfile)))
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
mi = MetaInformation(opf)
else:
opf, filelist = get_filelist(htmlfile, opts)
mi = merge_metadata(htmlfile, opf, opts)
opts.chapter = XPath(opts.chapter, opts.chapter = XPath(opts.chapter,
namespaces={'re':'http://exslt.org/regular-expressions'}) namespaces={'re':'http://exslt.org/regular-expressions'})
resource_map = parse_content(filelist, opts) with TemporaryDirectory('_html2epub') as tdir:
resource_map, htmlfile_map = parse_content(filelist, opts, tdir)
resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()] if opf.cover and os.access(opf.cover, os.R_OK):
shutil.copyfile(opf.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)))
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
shutil.copyfile(opf.cover, cpath)
resources.append(cpath)
mi.cover = cpath
if opf.cover and os.access(opf.cover, os.R_OK): spine = [htmlfile_map[f.path] for f in filelist]
shutil.copyfile(opf.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))) mi = create_metadata(tdir, mi, spine, resources)
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)) buf = cStringIO.StringIO()
shutil.copyfile(opf.cover, cpath) if mi.toc:
resources.append(cpath) rebase_toc(mi.toc, htmlfile_map, opts.output)
mi.cover = cpath with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f:
mi.render(f, buf)
toc = buf.getvalue()
if toc:
with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f:
f.write(toc)
def main(args=sys.argv): def main(args=sys.argv):
parser = option_parser() parser = option_parser()

View File

@ -1,10 +1,14 @@
from __future__ import with_statement from __future__ import with_statement
import cStringIO
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import sys, re, os, shutil, logging, tempfile '''
Code to recursively parse HTML files and create an open ebook in a specified
directory or zip file. All the action starts in :function:`create_dir`.
'''
import sys, re, os, shutil, logging, tempfile, cStringIO
from urlparse import urlparse from urlparse import urlparse
from urllib import unquote from urllib import unquote
@ -445,10 +449,10 @@ class Parser(PreProcessor, LoggingInterface):
self.raw_css = '\n\n'.join(css) self.raw_css = '\n\n'.join(css)
# TODO: Figure out what to do about CSS imports from linked stylesheets # TODO: Figure out what to do about CSS imports from linked stylesheets
def config(defaults=None): def config(defaults=None, config_name='html',
desc = _('Options to control the traversal of HTML') desc=_('Options to control the traversal of HTML')):
if defaults is None: if defaults is None:
c = Config('html', desc) c = Config(config_name, desc)
else: else:
c = StringConfig(defaults, desc) c = StringConfig(defaults, desc)
@ -482,10 +486,12 @@ def config(defaults=None):
def option_parser(): def option_parser():
c = config() c = config()
return c.option_parser(usage=_('''\ return c.option_parser(usage=_('''\
%prog [options] file.html %prog [options] file.html|opf
Follow all links in an HTML file and collect them into the specified directory. Follow all links in an HTML file and collect them into the specified directory.
Also collects any referenced resources like images, stylesheets, scripts, etc. Also collects any referenced resources like images, stylesheets, scripts, etc.
If an OPF file is specified instead, the list of files in its <spine> element
is used.
''')) '''))
def search_for_opf(dir): def search_for_opf(dir):
@ -566,7 +572,8 @@ def create_metadata(basepath, mi, filelist, resources):
def rebase_toc(toc, htmlfile_map, basepath, root=True): def rebase_toc(toc, htmlfile_map, basepath, root=True):
''' '''
Rebase a :class:`calibre.ebooks.metadata.toc.TOC` object. Rebase a :class:`calibre.ebooks.metadata.toc.TOC` object. Maps all entries
in the TOC to point to their new locations relative to the new OPF file.
''' '''
def fix_entry(entry): def fix_entry(entry):
if entry.abspath in htmlfile_map.keys(): if entry.abspath in htmlfile_map.keys():
@ -582,15 +589,23 @@ def create_dir(htmlfile, opts):
''' '''
Create a directory that contains the open ebook Create a directory that contains the open ebook
''' '''
opf, filelist = get_filelist(htmlfile, opts) if htmlfile.lower().endswith('.opf'):
mi = merge_metadata(htmlfile, opf, opts) opf = OPFReader(open(htmlfile, 'rb'), os.path.dirname(os.path.abspath(htmlfile)))
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
mi = MetaInformation(opf)
else:
opf, filelist = get_filelist(htmlfile, opts)
mi = merge_metadata(htmlfile, opf, opts)
resource_map, htmlfile_map = parse_content(filelist, opts) resource_map, htmlfile_map = parse_content(filelist, opts)
resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()] resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
if opf and opf.cover and os.access(opf.cover, os.R_OK): if opf and opf.cover and os.access(opf.cover, os.R_OK):
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[-1]) cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[-1])
shutil.copyfile(opf.cover, cpath) shutil.copyfile(opf.cover, cpath)
resources.append(cpath) resources.append(cpath)
mi.cover = cpath mi.cover = cpath
spine = [htmlfile_map[f.path] for f in filelist] spine = [htmlfile_map[f.path] for f in filelist]
mi = create_metadata(opts.output, mi, spine, resources) mi = create_metadata(opts.output, mi, spine, resources)
buf = cStringIO.StringIO() buf = cStringIO.StringIO()

View File

@ -105,7 +105,6 @@ def set_metadata(stream, mi):
reader.opf.smart_update(mi) reader.opf.smart_update(mi)
newopf = StringIO(reader.opf.render()) newopf = StringIO(reader.opf.render())
safe_replace(stream, reader.container[OPF.MIMETYPE], newopf) safe_replace(stream, reader.container[OPF.MIMETYPE], newopf)
print newopf.getvalue()
def option_parser(): def option_parser():
parser = get_parser('epub') parser = get_parser('epub')

View File

@ -150,7 +150,7 @@ class OPF(object):
def fset(self, val): def fset(self, val):
matches = self.isbn_path(self.tree) matches = self.isbn_path(self.tree)
if not matches: if not matches:
matches = [self.create_metadata_element('dc:identifier', matches = [self.create_metadata_element('identifier', ns='dc',
attrib={'{%s}scheme'%self.NAMESPACES['opf']:'ISBN'})] attrib={'{%s}scheme'%self.NAMESPACES['opf']:'ISBN'})]
matches[0].text = unicode(val) matches[0].text = unicode(val)
return property(fget=fget, fset=fset) return property(fget=fget, fset=fset)

View File

@ -920,8 +920,8 @@ def worker(host, port):
msg = 'ERROR:'+cPickle.dumps((exception, tb),-1) msg = 'ERROR:'+cPickle.dumps((exception, tb),-1)
write(client_socket, msg) write(client_socket, msg)
res = read(client_socket, 10) res = read(client_socket, 10)
if res != 'OK': if res != 'OK':
break break
gc.collect() gc.collect()
elif msg == 'PING:': elif msg == 'PING:':
write(client_socket, 'OK') write(client_socket, 'OK')