IGN:Improved TOC detection for html2epub and miscellaneous bug fixes

This commit is contained in:
Kovid Goyal 2008-09-23 15:19:29 -07:00
parent 8f090b2d31
commit ba38ad39bf
6 changed files with 57 additions and 13 deletions

View File

@ -289,7 +289,7 @@ class LoggingInterface:
formatter = logging.Formatter() formatter = logging.Formatter()
level = logging.INFO level = logging.INFO
if verbosity > 0: if verbosity > 0:
formatter = ColoredFormatter('[%(levelname)s] %(filename)s:%(lineno)s: %(message)s') if verbosity > 1 else \ formatter = ColoredFormatter('[%(levelname)s] %(message)s') if verbosity > 1 else \
ColoredFormatter('%(levelname)s: %(message)s') ColoredFormatter('%(levelname)s: %(message)s')
level = logging.DEBUG level = logging.DEBUG
if verbosity > 1: if verbosity > 1:

View File

@ -74,7 +74,7 @@ MAP = {
'txt' : txt2opf, 'txt' : txt2opf,
'pdf' : pdf2opf, 'pdf' : pdf2opf,
} }
SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'rar', 'zip'] SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'rar', 'zip', 'htm', 'html']
def unarchive(path, tdir): def unarchive(path, tdir):
extract(path, tdir) extract(path, tdir)

View File

@ -149,7 +149,7 @@ def convert(htmlfile, opts, notification=None):
buf = cStringIO.StringIO() buf = cStringIO.StringIO()
if mi.toc: if mi.toc:
rebase_toc(mi.toc, htmlfile_map, tdir) rebase_toc(mi.toc, htmlfile_map, tdir)
if opts.use_auto_toc or mi.toc is None or len(mi.toc) < 2: if opts.use_auto_toc or mi.toc is None or len(list(mi.toc.flat())) < 2:
mi.toc = generated_toc mi.toc = generated_toc
for item in mi.manifest: for item in mi.manifest:
if getattr(item, 'mime_type', None) == 'text/html': if getattr(item, 'mime_type', None) == 'text/html':
@ -184,4 +184,4 @@ def main(args=sys.argv):
return 0 return 0
if __name__ == '__main__': if __name__ == '__main__':
sys.exit(main()) sys.exit(main())

View File

@ -48,8 +48,14 @@ class Splitter(LoggingInterface):
css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root) css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root)
if css: if css:
cssp = os.path.join('content', *(css[0].get('href').split('/'))) cssp = os.path.join('content', *(css[0].get('href').split('/')))
self.log_debug('\t\tParsing stylesheet...') self.log_debug('\t\tParsing stylesheet...')
stylesheet = CSSParser().parseString(open(cssp, 'rb').read()) try:
stylesheet = CSSParser().parseString(open(cssp, 'rb').read())
except:
self.log_warn('Failed to parse CSS. Splitting on page-breaks is disabled')
if self.opts.verbose > 1:
self.log_exception('')
stylesheet = None
else: else:
stylesheet = None stylesheet = None
self.page_breaks = [] self.page_breaks = []

View File

@ -123,6 +123,8 @@ class HTMLFile(object):
if encoding is None: if encoding is None:
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1] encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
self.encoding = encoding self.encoding = encoding
else:
self.encoding = encoding
src = src.decode(encoding, 'replace') src = src.decode(encoding, 'replace')
match = self.TITLE_PAT.search(src) match = self.TITLE_PAT.search(src)
@ -200,6 +202,8 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
continue continue
try: try:
nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf) nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
if nf.is_binary:
raise IgnoreFile('%s is a binary file'%nf.path, -1)
nl.append(nf) nl.append(nf)
flat.append(nf) flat.append(nf)
except IgnoreFile, err: except IgnoreFile, err:
@ -235,7 +239,7 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
if path not in flat: if path not in flat:
flat.append(path) flat.append(path)
flat = [HTMLFile(path, 0, encoding, verbose) for path in flat] flat = [HTMLFile(path, 0, encoding, verbose) for path in flat]
return flat return [f for f in flat if not f.is_binary]
@ -521,8 +525,6 @@ class Processor(Parser):
Remove all CSS information from the document and store in self.raw_css. Remove all CSS information from the document and store in self.raw_css.
This includes <font> tags. This includes <font> tags.
''' '''
counter = 0
def get_id(chapter, counter, prefix='calibre_css_'): def get_id(chapter, counter, prefix='calibre_css_'):
new_id = '%s_%d'%(prefix, counter) new_id = '%s_%d'%(prefix, counter)
if chapter.tag.lower() == 'a' and 'name' in chapter.keys(): if chapter.tag.lower() == 'a' and 'name' in chapter.keys():
@ -667,7 +669,10 @@ def get_filelist(htmlfile, opts):
opf = search_for_opf(dir) opf = search_for_opf(dir)
filelist = None filelist = None
if opf is not None: if opf is not None:
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding) try:
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
except:
pass
if not filelist: if not filelist:
filelist = traverse(htmlfile, max_levels=int(opts.max_levels), filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
verbose=opts.verbose, encoding=opts.encoding)\ verbose=opts.verbose, encoding=opts.encoding)\

View File

@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
lxml based OPF parser. lxml based OPF parser.
''' '''
import sys, unittest, functools, os, mimetypes, uuid import sys, unittest, functools, os, mimetypes, uuid, glob
from urllib import unquote from urllib import unquote
from urlparse import urlparse from urlparse import urlparse
@ -446,10 +446,43 @@ class OPF(object):
self.spine = Spine.from_opf_spine_element(s, self.manifest) self.spine = Spine.from_opf_spine_element(s, self.manifest)
self.guide = None self.guide = None
guide = self.guide_path(self.root) guide = self.guide_path(self.root)
if guide: self.guide = Guide.from_opf_guide(guide, basedir) if guide else None
self.guide = Guide.from_opf_guide(guide, basedir)
self.cover_data = (None, None) self.cover_data = (None, None)
self.find_toc()
def find_toc(self):
    '''
    Locate the table of contents referenced by this OPF and load it
    into ``self.toc``.

    Candidates are tried in this order:

      1. the ``toc`` attribute of the first spine element,
      2. a guide item whose type is ``toc``,
      3. a manifest item whose href contains ``toc``.

    ``self.toc`` is left as ``None`` when no candidate is found or when
    the lookup itself fails. Detection is best effort: ordinary errors
    are suppressed so that a broken TOC never prevents the OPF from
    loading. If an error occurs while the TOC file is being read,
    ``self.toc`` keeps whatever had been loaded up to that point.
    '''
    self.toc = None
    try:
        spine = self.XPath('descendant::*[re:match(name(), "spine", "i")]')(self.root)
        toc = None
        if spine:
            toc = spine[0].get('toc', None)
        if toc is None and self.guide:
            # No early exit: the last guide entry of type "toc" wins.
            for item in self.guide:
                if item.type and item.type.lower() == 'toc':
                    toc = item.path
        if toc is None:
            # Last resort: any manifest entry whose href mentions "toc"
            # (again, the last match wins).
            for item in self.manifest:
                if 'toc' in item.href().lower():
                    toc = item.path
        if toc is None:
            return

        self.toc = TOC(base_path=self.base_dir)
        if toc.lower() in ('ncx', 'ncxtoc'):
            # Here toc is treated as a manifest id naming an NCX file.
            path = self.manifest.path_for_id(toc)
            if path:
                self.toc.read_ncx_toc(path)
            else:
                # The id did not resolve; fall back to the first .ncx
                # file found in the base directory, if there is one.
                candidates = glob.glob(os.path.join(self.base_dir, '*.ncx'))
                if candidates:
                    self.toc.read_ncx_toc(candidates[0])
        else:
            self.toc.read_html_toc(toc)
    except Exception:
        # Deliberately best effort, but do not swallow KeyboardInterrupt
        # or SystemExit the way a bare ``except:`` would.
        pass
def get_text(self, elem): def get_text(self, elem):
return u''.join(self.TEXT(elem)) return u''.join(self.TEXT(elem))