From ed7459e2cea183cba8af9664cfffe99b40987669 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Mon, 11 Jul 2011 21:47:08 -0400
Subject: [PATCH] HTMLZ Input: Do a better job of detecting index HTML file in
 archive. Warn user if multiple HTML files are found. Ensure index file is not
 empty. Only load top level index file.

---
 src/calibre/ebooks/htmlz/input.py | 46 +++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/htmlz/input.py b/src/calibre/ebooks/htmlz/input.py
index 3cf95b8a48..f0f45f72fe 100644
--- a/src/calibre/ebooks/htmlz/input.py
+++ b/src/calibre/ebooks/htmlz/input.py
@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
 
 import os
 
-from calibre import guess_type, walk
+from calibre import guess_type
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.metadata.opf2 import OPF
@@ -25,16 +25,50 @@ class HTMLZInput(InputFormatPlugin):
                 accelerators):
         self.log = log
         html = u''
+        top_levels = []
 
         # Extract content from zip archive.
         zf = ZipFile(stream)
         zf.extractall()
 
-        for x in walk('.'):
+        # Find the HTML file in the archive. It needs to be
+        # top level.
+        index = u''
+        multiple_html = False
+        # Get a list of all top level files in the archive.
+        for x in os.listdir('.'):
+            if os.path.isfile(x):
+                top_levels.append(x)
+        # Try to find an index file.
+        for x in top_levels:
+            if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
+                index = x
+                break
+        # Look for multiple HTML files in the archive. We look at the
+        # top level files only as only they matter in HTMLZ.
+        for x in top_levels:
             if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
-                with open(x, 'rb') as tf:
-                    html = tf.read()
-                    break
+                # Set index to the first HTML file found if it's not
+                # called index.
+                if not index:
+                    index = x
+                else:
+                    multiple_html = True
+        # Warn the user if there are multiple HTML files in the archive. HTMLZ
+        # supports a single HTML file. A conversion of an HTMLZ archive with
+        # multiple HTML files probably won't turn out as the user expects. With
+        # multiple HTML files, ZIP input should be used in place of HTMLZ.
+        if multiple_html:
+            log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)
+
+        if index:
+            with open(index, 'rb') as tf:
+                html = tf.read()
+        else:
+            raise Exception(_('No top level HTML file found.'))
+
+        if not html:
+            raise Exception(_('Top level HTML file %s is empty') % index)
 
         # Encoding
         if options.input_encoding:
@@ -75,7 +109,7 @@ class HTMLZInput(InputFormatPlugin):
         # Get the cover path from the OPF.
         cover_path = None
         opf = None
-        for x in walk('.'):
+        for x in top_levels:
             if os.path.splitext(x)[1].lower() in ('.opf'):
                 opf = x
                 break