From ed7459e2cea183cba8af9664cfffe99b40987669 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 11 Jul 2011 21:47:08 -0400 Subject: [PATCH] HTMLZ Input: Do a better job of detecting index HTML file in archive. Warn user if multiple HTML files are found. Ensure index file is not empty. Only load top level index file. --- src/calibre/ebooks/htmlz/input.py | 46 +++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/htmlz/input.py b/src/calibre/ebooks/htmlz/input.py index 3cf95b8a48..f0f45f72fe 100644 --- a/src/calibre/ebooks/htmlz/input.py +++ b/src/calibre/ebooks/htmlz/input.py @@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en' import os -from calibre import guess_type, walk +from calibre import guess_type from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.metadata.opf2 import OPF @@ -25,16 +25,50 @@ class HTMLZInput(InputFormatPlugin): accelerators): self.log = log html = u'' + top_levels = [] # Extract content from zip archive. zf = ZipFile(stream) zf.extractall() - for x in walk('.'): + # Find the HTML file in the archive. It needs to be + # top level. + index = u'' + multiple_html = False + # Get a list of all top level files in the archive. + for x in os.listdir('.'): + if os.path.isfile(x): + top_levels.append(x) + # Try to find an index file. + for x in top_levels: + if x.lower() in ('index.html', 'index.xhtml', 'index.htm'): + index = x + break + # Look for multiple HTML files in the archive. We look at the + # top level files only as only they matter in HTMLZ. + for x in top_levels: if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'): - with open(x, 'rb') as tf: - html = tf.read() - break + # Set index to the first HTML file found if it's not + # called index. + if not index: + index = x + else: + multiple_html = True + # Warn the user if there are multiple HTML files in the archive. 
HTMLZ + # supports a single HTML file. A conversion of an HTMLZ archive + # with multiple HTML files probably won't turn out as the user expects. With + # multiple HTML files, ZIP input should be used in place of HTMLZ. + if multiple_html: + log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index) + + if index: + with open(index, 'rb') as tf: + html = tf.read() + else: + raise Exception(_('No top level HTML file found.')) + + if not html: + raise Exception(_('Top level HTML file %s is empty') % index) # Encoding if options.input_encoding: @@ -75,7 +109,7 @@ class HTMLZInput(InputFormatPlugin): # Get the cover path from the OPF. cover_path = None opf = None - for x in walk('.'): + for x in top_levels: if os.path.splitext(x)[1].lower() in ('.opf'): opf = x break