diff --git a/src/calibre/ebooks/epub/from_any.py b/src/calibre/ebooks/epub/from_any.py index 6ebf86e81a..dd79901e90 100644 --- a/src/calibre/ebooks/epub/from_any.py +++ b/src/calibre/ebooks/epub/from_any.py @@ -13,7 +13,7 @@ from contextlib import nested from calibre import extract, walk from calibre.ebooks import DRMError from calibre.ebooks.epub import config as common_config -from calibre.ebooks.epub.from_html import convert as html2epub +from calibre.ebooks.epub.from_html import convert as html2epub, find_html_index from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.opf2 import OPFCreator @@ -103,18 +103,7 @@ def unarchive(path, tdir): if ext in ['txt', 'rtf'] and os.stat(f).st_size < 2048: continue return f, ext - html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE) - html_files = [f for f in files if html_pat.search(f) is not None] - if not html_files: - raise ValueError(_('Could not find an ebook inside the archive')) - html_files = [(f, os.stat(f).st_size) for f in html_files] - html_files.sort(cmp = lambda x, y: cmp(x[1], y[1])) - html_files = [f[0] for f in html_files] - for q in ('toc', 'index'): - for f in html_files: - if os.path.splitext(f)[0].lower() == q: - return f, os.path.splitext(f)[1].lower()[1:] - return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:] + return find_html_index(files) def any2epub(opts, path, notification=None): ext = os.path.splitext(path)[1] diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index 54f91e9f0f..586b37d70c 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -32,7 +32,7 @@ Conversion of HTML/OPF files follows several stages: * The EPUB container is created. 
def find_html_index(files):
    '''
    Given a list of files, find the most likely root HTML file in the
    list.

    Only paths ending in .htm, .html, .xhtm or .xhtml (case insensitive)
    are considered. A file whose base name, without directory and
    extension, is ``toc`` is preferred, then one named ``index``; if
    several files share the preferred name the smallest one wins. When no
    such file exists the largest HTML file is returned.

    :param files: Iterable of paths to existing files. It is consumed only
                  once, so a generator is acceptable.
    :return: A tuple ``(path, ext)`` where ``ext`` is the lower-cased
             extension of ``path`` without the leading dot.
    :raises ValueError: If ``files`` contains no HTML file.
    '''
    html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE)
    html_files = [f for f in files if html_pat.search(f) is not None]
    if not html_files:
        raise ValueError(_('Could not find an ebook inside the archive'))

    # Order by size, smallest first. The sort is stable, so files of equal
    # size keep the order in which they were supplied.
    html_files.sort(key=lambda f: os.stat(f).st_size)

    def extension(path):
        return os.path.splitext(path)[1].lower()[1:]

    for q in ('toc', 'index'):
        for f in html_files:
            # Compare only the file's own name: callers pass full paths
            # (e.g. the output of walk()), so comparing the whole path
            # minus its extension would never match.
            stem = os.path.splitext(os.path.basename(f))[0]
            if stem.lower() == q:
                return f, extension(f)
    return html_files[-1], extension(html_files[-1])
0, encoding, verbose) for path in flat] - return [f for f in flat if not f.is_binary] + ans = [] + for path in flat: + if os.path.exists(path): + ans.append(HTMLFile(path, 0, encoding, verbose)) + else: + print 'WARNING: OPF spine item %s does not exist'%path + ans = [f for f in ans if not f.is_binary] + return ans