IGN:Improved TOC detection for html2epub and miscellaneous bug fixes

This commit is contained in:
Kovid Goyal 2008-09-23 15:19:29 -07:00
parent 8f090b2d31
commit ba38ad39bf
6 changed files with 57 additions and 13 deletions

View File

@ -289,7 +289,7 @@ class LoggingInterface:
formatter = logging.Formatter()
level = logging.INFO
if verbosity > 0:
formatter = ColoredFormatter('[%(levelname)s] %(filename)s:%(lineno)s: %(message)s') if verbosity > 1 else \
formatter = ColoredFormatter('[%(levelname)s] %(message)s') if verbosity > 1 else \
ColoredFormatter('%(levelname)s: %(message)s')
level = logging.DEBUG
if verbosity > 1:

View File

@ -74,7 +74,7 @@ MAP = {
'txt' : txt2opf,
'pdf' : pdf2opf,
}
SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'rar', 'zip']
SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'rar', 'zip', 'htm', 'html']
def unarchive(path, tdir):
extract(path, tdir)

View File

@ -149,7 +149,7 @@ def convert(htmlfile, opts, notification=None):
buf = cStringIO.StringIO()
if mi.toc:
rebase_toc(mi.toc, htmlfile_map, tdir)
if opts.use_auto_toc or mi.toc is None or len(mi.toc) < 2:
if opts.use_auto_toc or mi.toc is None or len(list(mi.toc.flat())) < 2:
mi.toc = generated_toc
for item in mi.manifest:
if getattr(item, 'mime_type', None) == 'text/html':
@ -184,4 +184,4 @@ def main(args=sys.argv):
return 0
if __name__ == '__main__':
sys.exit(main())
sys.exit(main())

View File

@ -48,8 +48,14 @@ class Splitter(LoggingInterface):
css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root)
if css:
cssp = os.path.join('content', *(css[0].get('href').split('/')))
self.log_debug('\t\tParsing stylesheet...')
stylesheet = CSSParser().parseString(open(cssp, 'rb').read())
self.log_debug('\t\tParsing stylesheet...')
try:
stylesheet = CSSParser().parseString(open(cssp, 'rb').read())
except:
self.log_warn('Failed to parse CSS. Splitting on page-breaks is disabled')
if self.opts.verbose > 1:
self.log_exception('')
stylesheet = None
else:
stylesheet = None
self.page_breaks = []

View File

@ -123,6 +123,8 @@ class HTMLFile(object):
if encoding is None:
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
self.encoding = encoding
else:
self.encoding = encoding
src = src.decode(encoding, 'replace')
match = self.TITLE_PAT.search(src)
@ -200,6 +202,8 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
continue
try:
nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
if nf.is_binary:
raise IgnoreFile('%s is a binary file'%nf.path, -1)
nl.append(nf)
flat.append(nf)
except IgnoreFile, err:
@ -235,7 +239,7 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
if path not in flat:
flat.append(path)
flat = [HTMLFile(path, 0, encoding, verbose) for path in flat]
return flat
return [f for f in flat if not f.is_binary]
@ -521,8 +525,6 @@ class Processor(Parser):
Remove all CSS information from the document and store in self.raw_css.
This includes <font> tags.
'''
counter = 0
def get_id(chapter, counter, prefix='calibre_css_'):
new_id = '%s_%d'%(prefix, counter)
if chapter.tag.lower() == 'a' and 'name' in chapter.keys():
@ -667,7 +669,10 @@ def get_filelist(htmlfile, opts):
opf = search_for_opf(dir)
filelist = None
if opf is not None:
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
try:
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
except:
pass
if not filelist:
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
verbose=opts.verbose, encoding=opts.encoding)\

View File

@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
lxml based OPF parser.
'''
import sys, unittest, functools, os, mimetypes, uuid
import sys, unittest, functools, os, mimetypes, uuid, glob
from urllib import unquote
from urlparse import urlparse
@ -446,10 +446,43 @@ class OPF(object):
self.spine = Spine.from_opf_spine_element(s, self.manifest)
self.guide = None
guide = self.guide_path(self.root)
if guide:
self.guide = Guide.from_opf_guide(guide, basedir)
self.guide = Guide.from_opf_guide(guide, basedir) if guide else None
self.cover_data = (None, None)
self.find_toc()
def find_toc(self):
    '''
    Locate the table of contents referenced by this OPF and load it
    into ``self.toc``.

    ``self.toc`` is reset to ``None`` first and stays ``None`` when no
    reference can be found. The reference is looked up in this order:

      1. the ``toc`` attribute of the first spine element,
      2. a guide item whose type is ``toc`` (the last match wins),
      3. a manifest item whose href contains ``toc`` (the last match wins).

    A reference of ``ncx`` or ``ncxtoc`` (case-insensitive) is treated as
    a manifest id; if the manifest has no path for it, the first
    ``*.ncx`` file found in ``self.base_dir`` is used instead. Any other
    reference is handed to ``read_html_toc``.

    This is best effort: ordinary errors raised while searching or
    parsing are suppressed so a broken TOC does not abort OPF loading.
    '''
    self.toc = None
    try:
        spine = self.XPath('descendant::*[re:match(name(), "spine", "i")]')(self.root)
        toc = None
        if spine:
            spine = spine[0]
            toc = spine.get('toc', None)
        if toc is None and self.guide:
            for item in self.guide:
                if item.type and item.type.lower() == 'toc':
                    toc = item.path
        if toc is None:
            for item in self.manifest:
                if 'toc' in item.href().lower():
                    toc = item.path
        if toc is None:
            return
        self.toc = TOC(base_path=self.base_dir)
        if toc.lower() in ('ncx', 'ncxtoc'):
            # The reference is a manifest id, not a path.
            path = self.manifest.path_for_id(toc)
            if path:
                self.toc.read_ncx_toc(path)
            else:
                # The id did not resolve to a path; fall back to any
                # NCX file sitting next to the OPF.
                f = glob.glob(os.path.join(self.base_dir, '*.ncx'))
                if f:
                    self.toc.read_ncx_toc(f[0])
        else:
            self.toc.read_html_toc(toc)
    except Exception:
        # Deliberately best effort, but do not swallow KeyboardInterrupt
        # or SystemExit the way a bare ``except:`` would.
        pass
def get_text(self, elem):
return u''.join(self.TEXT(elem))