HTMLZ Input: Do a better job of detecting index HTML file in archive. Warn user if multiple HTML files are found. Ensure index file is not empty. Only load top level index file.

This commit is contained in:
John Schember 2011-07-11 21:47:08 -04:00
parent 68778be31d
commit ed7459e2ce

View File

@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
import os
from calibre import guess_type, walk
from calibre import guess_type
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata.opf2 import OPF
@ -25,16 +25,50 @@ class HTMLZInput(InputFormatPlugin):
accelerators):
self.log = log
html = u''
top_levels = []
# Extract content from zip archive.
zf = ZipFile(stream)
zf.extractall()
for x in walk('.'):
# Find the HTML file in the archive. It needs to be
# top level.
index = u''
multiple_html = False
# Get a list of all top level files in the archive.
for x in os.listdir('.'):
if os.path.isfile(x):
top_levels.append(x)
# Try to find an index. file.
for x in top_levels:
if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
index = x
break
# Look for multiple HTML files in the archive. We look at the
# top level files only as only they matter in HTMLZ.
for x in top_levels:
if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
with open(x, 'rb') as tf:
html = tf.read()
break
# Set index to the first HTML file found if it's not
# called index.
if not index:
index = x
else:
multiple_html = True
# Warn the user if there multiple HTML file in the archive. HTMLZ
# supports a single HTML file. A conversion with a multiple HTML file
# HTMLZ archive probably won't turn out as the user expects. With
# Multiple HTML files ZIP input should be used in place of HTMLZ.
if multiple_html:
log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)
if index:
with open(index, 'rb') as tf:
html = tf.read()
else:
raise Exception(_('No top level HTML file found.'))
if not html:
raise Exception(_('Top level HTML file %s is empty') % index)
# Encoding
if options.input_encoding:
@ -75,7 +109,7 @@ class HTMLZInput(InputFormatPlugin):
# Get the cover path from the OPF.
cover_path = None
opf = None
for x in walk('.'):
for x in top_levels:
if os.path.splitext(x)[1].lower() in ('.opf'):
opf = x
break