Patch summary (text before the first "diff --git" header is ignored by
patch tools):

  * src/calibre/__init__.py: the verbosity > 1 log format no longer
    includes %(filename)s:%(lineno)s.
  * src/calibre/ebooks/epub/from_any.py: 'htm' and 'html' are added to
    SOURCE_FORMATS.
  * src/calibre/ebooks/epub/from_html.py: the auto-TOC fallback now tests
    len(list(mi.toc.flat())) instead of len(mi.toc); the file gains a
    trailing newline.
  * src/calibre/ebooks/epub/split.py: stylesheet parsing is wrapped in
    try/except; on failure a warning is logged and stylesheet is None.
  * src/calibre/ebooks/html.py: HTMLFile stores self.encoding when an
    encoding is passed in; traverse() raises IgnoreFile for binary files;
    opf_traverse() filters out binary files; get_filelist() ignores
    opf_traverse() failures and falls back to traverse(); a "counter = 0"
    local is dropped.
  * src/calibre/ebooks/metadata/opf2.py: imports glob; OPF gains find_toc(),
    called at the end of the initialisation code shown in the hunk.

NOTE(review): this file was recovered from a whitespace-collapsed copy.
Line breaks and blank-line counts were rebuilt from the hunk headers;
indentation depth and any intra-line alignment are reconstructed guesses.
If a hunk is rejected, retry with whitespace-insensitive matching
(e.g. "git apply --ignore-whitespace" or "patch -l").
NOTE(review): in split.py the log_debug line is removed and re-added with
identical text here; the original change was presumably whitespace-only
and is not recoverable.
NOTE(review): the three added bare "except:" clauses also catch
KeyboardInterrupt/SystemExit, and two of them discard the error silently;
consider "except Exception:" plus logging in a follow-up.
NOTE(review): find_toc() calls item.href() -- confirm href is a method and
not a property, since a TypeError there would be swallowed by the
enclosing bare except.

diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index c41e33ad50..152c58502f 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -289,7 +289,7 @@ class LoggingInterface:
         formatter = logging.Formatter()
         level = logging.INFO
         if verbosity > 0:
-            formatter = ColoredFormatter('[%(levelname)s] %(filename)s:%(lineno)s: %(message)s') if verbosity > 1 else \
+            formatter = ColoredFormatter('[%(levelname)s] %(message)s') if verbosity > 1 else \
                 ColoredFormatter('%(levelname)s: %(message)s')
             level = logging.DEBUG
             if verbosity > 1:
diff --git a/src/calibre/ebooks/epub/from_any.py b/src/calibre/ebooks/epub/from_any.py
index 4f372b85a0..d4d2240bca 100644
--- a/src/calibre/ebooks/epub/from_any.py
+++ b/src/calibre/ebooks/epub/from_any.py
@@ -74,7 +74,7 @@ MAP = {
     'txt' : txt2opf,
     'pdf' : pdf2opf,
 }
-SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'rar', 'zip']
+SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'fb2', 'rtf', 'txt', 'pdf', 'rar', 'zip', 'htm', 'html']
 
 def unarchive(path, tdir):
     extract(path, tdir)
diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py
index ef90128caf..dc4489d67e 100644
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -149,7 +149,7 @@ def convert(htmlfile, opts, notification=None):
         buf = cStringIO.StringIO()
         if mi.toc:
             rebase_toc(mi.toc, htmlfile_map, tdir)
-        if opts.use_auto_toc or mi.toc is None or len(mi.toc) < 2:
+        if opts.use_auto_toc or mi.toc is None or len(list(mi.toc.flat())) < 2:
             mi.toc = generated_toc
         for item in mi.manifest:
             if getattr(item, 'mime_type', None) == 'text/html':
@@ -184,4 +184,4 @@ def main(args=sys.argv):
     return 0
 
 if __name__ == '__main__':
-    sys.exit(main())
\ No newline at end of file
+    sys.exit(main())
diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/epub/split.py
index 8ab0fec437..4e17d19a34 100644
--- a/src/calibre/ebooks/epub/split.py
+++ b/src/calibre/ebooks/epub/split.py
@@ -48,8 +48,14 @@ class Splitter(LoggingInterface):
         css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root)
         if css:
             cssp = os.path.join('content', *(css[0].get('href').split('/')))
-            self.log_debug('\t\tParsing stylesheet...')
-            stylesheet = CSSParser().parseString(open(cssp, 'rb').read())
+            self.log_debug('\t\tParsing stylesheet...')
+            try:
+                stylesheet = CSSParser().parseString(open(cssp, 'rb').read())
+            except:
+                self.log_warn('Failed to parse CSS. Splitting on page-breaks is disabled')
+                if self.opts.verbose > 1:
+                    self.log_exception('')
+                stylesheet = None
         else:
             stylesheet = None
         self.page_breaks = []
diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py
index 713322e8c7..38f2157b07 100644
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -123,6 +123,8 @@ class HTMLFile(object):
             if encoding is None:
                 encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
                 self.encoding = encoding
+            else:
+                self.encoding = encoding
 
             src = src.decode(encoding, 'replace')
             match = self.TITLE_PAT.search(src)
@@ -200,6 +202,8 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
                     continue
                 try:
                     nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
+                    if nf.is_binary:
+                        raise IgnoreFile('%s is a binary file'%nf.path, -1)
                     nl.append(nf)
                     flat.append(nf)
                 except IgnoreFile, err:
@@ -235,7 +239,7 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
             if path not in flat:
                 flat.append(path)
     flat = [HTMLFile(path, 0, encoding, verbose) for path in flat]
-    return flat
+    return [f for f in flat if not f.is_binary]
 
 
 
@@ -521,8 +525,6 @@ class Processor(Parser):
         Remove all CSS information from the document and store in self.raw_css.
         This includes tags.
         '''
-        counter = 0
-
         def get_id(chapter, counter, prefix='calibre_css_'):
             new_id = '%s_%d'%(prefix, counter)
             if chapter.tag.lower() == 'a' and 'name' in chapter.keys():
@@ -667,7 +669,10 @@ def get_filelist(htmlfile, opts):
     opf = search_for_opf(dir)
     filelist = None
     if opf is not None:
-        filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
+        try:
+            filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
+        except:
+            pass
     if not filelist:
         filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
                             verbose=opts.verbose, encoding=opts.encoding)\
diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py
index 19b49eda40..bed50876fe 100644
--- a/src/calibre/ebooks/metadata/opf2.py
+++ b/src/calibre/ebooks/metadata/opf2.py
@@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
 lxml based OPF parser.
 '''
 
-import sys, unittest, functools, os, mimetypes, uuid
+import sys, unittest, functools, os, mimetypes, uuid, glob
 from urllib import unquote
 from urlparse import urlparse
 
@@ -446,10 +446,43 @@ class OPF(object):
             self.spine = Spine.from_opf_spine_element(s, self.manifest)
         self.guide = None
         guide = self.guide_path(self.root)
-        if guide:
-            self.guide = Guide.from_opf_guide(guide, basedir)
+        self.guide = Guide.from_opf_guide(guide, basedir) if guide else None
         self.cover_data = (None, None)
+        self.find_toc()
 
+    def find_toc(self):
+        self.toc = None
+        try:
+            spine = self.XPath('descendant::*[re:match(name(), "spine", "i")]')(self.root)
+            toc = None
+            if spine:
+                spine = spine[0]
+                toc = spine.get('toc', None)
+            if toc is None and self.guide:
+                for item in self.guide:
+                    if item.type and item.type.lower() == 'toc':
+                        toc = item.path
+            if toc is None:
+                for item in self.manifest:
+                    if 'toc' in item.href().lower():
+                        toc = item.path
+
+            if toc is None: return
+            self.toc = TOC(base_path=self.base_dir)
+            if toc.lower() in ('ncx', 'ncxtoc'):
+                path = self.manifest.path_for_id(toc)
+                if path:
+                    self.toc.read_ncx_toc(path)
+                else:
+                    f = glob.glob(os.path.join(self.base_dir, '*.ncx'))
+                    if f:
+                        self.toc.read_ncx_toc(f[0])
+            else:
+                self.toc.read_html_toc(toc)
+        except:
+            pass
+
+
     def get_text(self, elem):
         return u''.join(self.TEXT(elem))
 