mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
IGN:Improved TOC detection for html2epub and miscellaneous bug fixes
This commit is contained in:
parent
8f090b2d31
commit
ba38ad39bf
@ -289,7 +289,7 @@ class LoggingInterface:
|
|||||||
formatter = logging.Formatter()
|
formatter = logging.Formatter()
|
||||||
level = logging.INFO
|
level = logging.INFO
|
||||||
if verbosity > 0:
|
if verbosity > 0:
|
||||||
formatter = ColoredFormatter('[%(levelname)s] %(filename)s:%(lineno)s: %(message)s') if verbosity > 1 else \
|
formatter = ColoredFormatter('[%(levelname)s] %(message)s') if verbosity > 1 else \
|
||||||
ColoredFormatter('%(levelname)s: %(message)s')
|
ColoredFormatter('%(levelname)s: %(message)s')
|
||||||
level = logging.DEBUG
|
level = logging.DEBUG
|
||||||
if verbosity > 1:
|
if verbosity > 1:
|
||||||
|
@ -74,7 +74,7 @@ MAP = {
|
|||||||
'txt' : txt2opf,
|
'txt' : txt2opf,
|
||||||
'pdf' : pdf2opf,
|
'pdf' : pdf2opf,
|
||||||
}
|
}
|
||||||
SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'rar', 'zip']
|
SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'rar', 'zip', 'htm', 'html']
|
||||||
|
|
||||||
def unarchive(path, tdir):
|
def unarchive(path, tdir):
|
||||||
extract(path, tdir)
|
extract(path, tdir)
|
||||||
|
@ -149,7 +149,7 @@ def convert(htmlfile, opts, notification=None):
|
|||||||
buf = cStringIO.StringIO()
|
buf = cStringIO.StringIO()
|
||||||
if mi.toc:
|
if mi.toc:
|
||||||
rebase_toc(mi.toc, htmlfile_map, tdir)
|
rebase_toc(mi.toc, htmlfile_map, tdir)
|
||||||
if opts.use_auto_toc or mi.toc is None or len(mi.toc) < 2:
|
if opts.use_auto_toc or mi.toc is None or len(list(mi.toc.flat())) < 2:
|
||||||
mi.toc = generated_toc
|
mi.toc = generated_toc
|
||||||
for item in mi.manifest:
|
for item in mi.manifest:
|
||||||
if getattr(item, 'mime_type', None) == 'text/html':
|
if getattr(item, 'mime_type', None) == 'text/html':
|
||||||
@ -184,4 +184,4 @@ def main(args=sys.argv):
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main())
|
sys.exit(main())
|
||||||
|
@ -48,8 +48,14 @@ class Splitter(LoggingInterface):
|
|||||||
css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root)
|
css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root)
|
||||||
if css:
|
if css:
|
||||||
cssp = os.path.join('content', *(css[0].get('href').split('/')))
|
cssp = os.path.join('content', *(css[0].get('href').split('/')))
|
||||||
self.log_debug('\t\tParsing stylesheet...')
|
self.log_debug('\t\tParsing stylesheet...')
|
||||||
stylesheet = CSSParser().parseString(open(cssp, 'rb').read())
|
try:
|
||||||
|
stylesheet = CSSParser().parseString(open(cssp, 'rb').read())
|
||||||
|
except:
|
||||||
|
self.log_warn('Failed to parse CSS. Splitting on page-breaks is disabled')
|
||||||
|
if self.opts.verbose > 1:
|
||||||
|
self.log_exception('')
|
||||||
|
stylesheet = None
|
||||||
else:
|
else:
|
||||||
stylesheet = None
|
stylesheet = None
|
||||||
self.page_breaks = []
|
self.page_breaks = []
|
||||||
|
@ -123,6 +123,8 @@ class HTMLFile(object):
|
|||||||
if encoding is None:
|
if encoding is None:
|
||||||
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
|
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
|
else:
|
||||||
|
self.encoding = encoding
|
||||||
|
|
||||||
src = src.decode(encoding, 'replace')
|
src = src.decode(encoding, 'replace')
|
||||||
match = self.TITLE_PAT.search(src)
|
match = self.TITLE_PAT.search(src)
|
||||||
@ -200,6 +202,8 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
|
|||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
|
nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
|
||||||
|
if nf.is_binary:
|
||||||
|
raise IgnoreFile('%s is a binary file'%nf.path, -1)
|
||||||
nl.append(nf)
|
nl.append(nf)
|
||||||
flat.append(nf)
|
flat.append(nf)
|
||||||
except IgnoreFile, err:
|
except IgnoreFile, err:
|
||||||
@ -235,7 +239,7 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
|
|||||||
if path not in flat:
|
if path not in flat:
|
||||||
flat.append(path)
|
flat.append(path)
|
||||||
flat = [HTMLFile(path, 0, encoding, verbose) for path in flat]
|
flat = [HTMLFile(path, 0, encoding, verbose) for path in flat]
|
||||||
return flat
|
return [f for f in flat if not f.is_binary]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -521,8 +525,6 @@ class Processor(Parser):
|
|||||||
Remove all CSS information from the document and store in self.raw_css.
|
Remove all CSS information from the document and store in self.raw_css.
|
||||||
This includes <font> tags.
|
This includes <font> tags.
|
||||||
'''
|
'''
|
||||||
counter = 0
|
|
||||||
|
|
||||||
def get_id(chapter, counter, prefix='calibre_css_'):
|
def get_id(chapter, counter, prefix='calibre_css_'):
|
||||||
new_id = '%s_%d'%(prefix, counter)
|
new_id = '%s_%d'%(prefix, counter)
|
||||||
if chapter.tag.lower() == 'a' and 'name' in chapter.keys():
|
if chapter.tag.lower() == 'a' and 'name' in chapter.keys():
|
||||||
@ -667,7 +669,10 @@ def get_filelist(htmlfile, opts):
|
|||||||
opf = search_for_opf(dir)
|
opf = search_for_opf(dir)
|
||||||
filelist = None
|
filelist = None
|
||||||
if opf is not None:
|
if opf is not None:
|
||||||
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
|
try:
|
||||||
|
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
if not filelist:
|
if not filelist:
|
||||||
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
|
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
|
||||||
verbose=opts.verbose, encoding=opts.encoding)\
|
verbose=opts.verbose, encoding=opts.encoding)\
|
||||||
|
@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
lxml based OPF parser.
|
lxml based OPF parser.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import sys, unittest, functools, os, mimetypes, uuid
|
import sys, unittest, functools, os, mimetypes, uuid, glob
|
||||||
from urllib import unquote
|
from urllib import unquote
|
||||||
from urlparse import urlparse
|
from urlparse import urlparse
|
||||||
|
|
||||||
@ -446,10 +446,43 @@ class OPF(object):
|
|||||||
self.spine = Spine.from_opf_spine_element(s, self.manifest)
|
self.spine = Spine.from_opf_spine_element(s, self.manifest)
|
||||||
self.guide = None
|
self.guide = None
|
||||||
guide = self.guide_path(self.root)
|
guide = self.guide_path(self.root)
|
||||||
if guide:
|
self.guide = Guide.from_opf_guide(guide, basedir) if guide else None
|
||||||
self.guide = Guide.from_opf_guide(guide, basedir)
|
|
||||||
self.cover_data = (None, None)
|
self.cover_data = (None, None)
|
||||||
|
self.find_toc()
|
||||||
|
|
||||||
|
def find_toc(self):
    '''
    Locate the table of contents referenced by this OPF and load it into
    ``self.toc``.

    ``self.toc`` is reset to ``None`` first. Candidate TOC references are
    then tried in this order, each one only if the previous gave nothing:

      1. the ``toc`` attribute of the first spine element,
      2. a guide item whose type is ``toc`` (case-insensitive),
      3. a manifest item whose href contains ``toc`` (case-insensitive).

    In steps 2 and 3 the loops do not stop at the first hit, so the *last*
    matching item wins.

    If the reference is one of the ids ``ncx``/``ncxtoc`` it is resolved
    through the manifest and read as an NCX file, falling back to the first
    ``*.ncx`` file found in ``self.base_dir``. Any other reference is read
    as an HTML TOC.

    Detection is best-effort: ordinary errors are swallowed and leave
    ``self.toc`` in whatever state it had reached (``None`` if nothing was
    found, or a possibly empty/partial TOC object if reading failed).
    '''
    self.toc = None
    try:
        spine = self.XPath('descendant::*[re:match(name(), "spine", "i")]')(self.root)
        toc = None
        if spine:
            spine = spine[0]
            toc = spine.get('toc', None)
        if toc is None and self.guide:
            for item in self.guide:
                if item.type and item.type.lower() == 'toc':
                    toc = item.path
        if toc is None:
            for item in self.manifest:
                # NOTE(review): href is invoked as a method here. If manifest
                # items expose href as a property instead, this call raises
                # and the handler below silently disables this fallback --
                # confirm against the manifest item class.
                if 'toc' in item.href().lower():
                    toc = item.path

        if toc is None:
            return
        self.toc = TOC(base_path=self.base_dir)
        if toc.lower() in ('ncx', 'ncxtoc'):
            path = self.manifest.path_for_id(toc)
            if path:
                self.toc.read_ncx_toc(path)
            else:
                # The id did not resolve through the manifest; fall back to
                # any NCX file sitting next to the OPF.
                f = glob.glob(os.path.join(self.base_dir, '*.ncx'))
                if f:
                    self.toc.read_ncx_toc(f[0])
        else:
            self.toc.read_html_toc(toc)
    except Exception:
        # A missing or malformed TOC must not prevent the OPF from loading,
        # so ordinary errors are ignored. Catching Exception rather than
        # using a bare except lets KeyboardInterrupt and SystemExit
        # propagate instead of being silently discarded.
        pass
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_text(self, elem):
|
def get_text(self, elem):
|
||||||
return u''.join(self.TEXT(elem))
|
return u''.join(self.TEXT(elem))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user