mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
IGN:Improved TOC detection for html2epub and miscellaneous bug fixes
This commit is contained in:
parent
8f090b2d31
commit
ba38ad39bf
@ -289,7 +289,7 @@ class LoggingInterface:
|
||||
formatter = logging.Formatter()
|
||||
level = logging.INFO
|
||||
if verbosity > 0:
|
||||
formatter = ColoredFormatter('[%(levelname)s] %(filename)s:%(lineno)s: %(message)s') if verbosity > 1 else \
|
||||
formatter = ColoredFormatter('[%(levelname)s] %(message)s') if verbosity > 1 else \
|
||||
ColoredFormatter('%(levelname)s: %(message)s')
|
||||
level = logging.DEBUG
|
||||
if verbosity > 1:
|
||||
|
@ -74,7 +74,7 @@ MAP = {
|
||||
'txt' : txt2opf,
|
||||
'pdf' : pdf2opf,
|
||||
}
|
||||
SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'rar', 'zip']
|
||||
SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'rar', 'zip', 'htm', 'html']
|
||||
|
||||
def unarchive(path, tdir):
|
||||
extract(path, tdir)
|
||||
|
@ -149,7 +149,7 @@ def convert(htmlfile, opts, notification=None):
|
||||
buf = cStringIO.StringIO()
|
||||
if mi.toc:
|
||||
rebase_toc(mi.toc, htmlfile_map, tdir)
|
||||
if opts.use_auto_toc or mi.toc is None or len(mi.toc) < 2:
|
||||
if opts.use_auto_toc or mi.toc is None or len(list(mi.toc.flat())) < 2:
|
||||
mi.toc = generated_toc
|
||||
for item in mi.manifest:
|
||||
if getattr(item, 'mime_type', None) == 'text/html':
|
||||
@ -184,4 +184,4 @@ def main(args=sys.argv):
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
sys.exit(main())
|
||||
|
@ -48,8 +48,14 @@ class Splitter(LoggingInterface):
|
||||
css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root)
|
||||
if css:
|
||||
cssp = os.path.join('content', *(css[0].get('href').split('/')))
|
||||
self.log_debug('\t\tParsing stylesheet...')
|
||||
stylesheet = CSSParser().parseString(open(cssp, 'rb').read())
|
||||
self.log_debug('\t\tParsing stylesheet...')
|
||||
try:
|
||||
stylesheet = CSSParser().parseString(open(cssp, 'rb').read())
|
||||
except:
|
||||
self.log_warn('Failed to parse CSS. Splitting on page-breaks is disabled')
|
||||
if self.opts.verbose > 1:
|
||||
self.log_exception('')
|
||||
stylesheet = None
|
||||
else:
|
||||
stylesheet = None
|
||||
self.page_breaks = []
|
||||
|
@ -123,6 +123,8 @@ class HTMLFile(object):
|
||||
if encoding is None:
|
||||
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
|
||||
self.encoding = encoding
|
||||
else:
|
||||
self.encoding = encoding
|
||||
|
||||
src = src.decode(encoding, 'replace')
|
||||
match = self.TITLE_PAT.search(src)
|
||||
@ -200,6 +202,8 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
|
||||
continue
|
||||
try:
|
||||
nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
|
||||
if nf.is_binary:
|
||||
raise IgnoreFile('%s is a binary file'%nf.path, -1)
|
||||
nl.append(nf)
|
||||
flat.append(nf)
|
||||
except IgnoreFile, err:
|
||||
@ -235,7 +239,7 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
|
||||
if path not in flat:
|
||||
flat.append(path)
|
||||
flat = [HTMLFile(path, 0, encoding, verbose) for path in flat]
|
||||
return flat
|
||||
return [f for f in flat if not f.is_binary]
|
||||
|
||||
|
||||
|
||||
@ -521,8 +525,6 @@ class Processor(Parser):
|
||||
Remove all CSS information from the document and store in self.raw_css.
|
||||
This includes <font> tags.
|
||||
'''
|
||||
counter = 0
|
||||
|
||||
def get_id(chapter, counter, prefix='calibre_css_'):
|
||||
new_id = '%s_%d'%(prefix, counter)
|
||||
if chapter.tag.lower() == 'a' and 'name' in chapter.keys():
|
||||
@ -667,7 +669,10 @@ def get_filelist(htmlfile, opts):
|
||||
opf = search_for_opf(dir)
|
||||
filelist = None
|
||||
if opf is not None:
|
||||
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
|
||||
try:
|
||||
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
|
||||
except:
|
||||
pass
|
||||
if not filelist:
|
||||
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
|
||||
verbose=opts.verbose, encoding=opts.encoding)\
|
||||
|
@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
|
||||
lxml based OPF parser.
|
||||
'''
|
||||
|
||||
import sys, unittest, functools, os, mimetypes, uuid
|
||||
import sys, unittest, functools, os, mimetypes, uuid, glob
|
||||
from urllib import unquote
|
||||
from urlparse import urlparse
|
||||
|
||||
@ -446,10 +446,43 @@ class OPF(object):
|
||||
self.spine = Spine.from_opf_spine_element(s, self.manifest)
|
||||
self.guide = None
|
||||
guide = self.guide_path(self.root)
|
||||
if guide:
|
||||
self.guide = Guide.from_opf_guide(guide, basedir)
|
||||
self.guide = Guide.from_opf_guide(guide, basedir) if guide else None
|
||||
self.cover_data = (None, None)
|
||||
self.find_toc()
|
||||
|
||||
def find_toc(self):
|
||||
self.toc = None
|
||||
try:
|
||||
spine = self.XPath('descendant::*[re:match(name(), "spine", "i")]')(self.root)
|
||||
toc = None
|
||||
if spine:
|
||||
spine = spine[0]
|
||||
toc = spine.get('toc', None)
|
||||
if toc is None and self.guide:
|
||||
for item in self.guide:
|
||||
if item.type and item.type.lower() == 'toc':
|
||||
toc = item.path
|
||||
if toc is None:
|
||||
for item in self.manifest:
|
||||
if 'toc' in item.href().lower():
|
||||
toc = item.path
|
||||
|
||||
if toc is None: return
|
||||
self.toc = TOC(base_path=self.base_dir)
|
||||
if toc.lower() in ('ncx', 'ncxtoc'):
|
||||
path = self.manifest.path_for_id(toc)
|
||||
if path:
|
||||
self.toc.read_ncx_toc(path)
|
||||
else:
|
||||
f = glob.glob(os.path.join(self.base_dir, '*.ncx'))
|
||||
if f:
|
||||
self.toc.read_ncx_toc(f[0])
|
||||
else:
|
||||
self.toc.read_html_toc(toc)
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
|
||||
def get_text(self, elem):
|
||||
return u''.join(self.TEXT(elem))
|
||||
|
Loading…
x
Reference in New Issue
Block a user