mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
HTMLZ Input: Do a better job of detecting index HTML file in archive. Warn user if multiple HTML files are found. Ensure index file is not empty. Only load top level index file.
This commit is contained in:
parent
68778be31d
commit
ed7459e2ce
@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from calibre import guess_type, walk
|
from calibre import guess_type
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.ebooks.metadata.opf2 import OPF
|
from calibre.ebooks.metadata.opf2 import OPF
|
||||||
@ -25,16 +25,50 @@ class HTMLZInput(InputFormatPlugin):
|
|||||||
accelerators):
|
accelerators):
|
||||||
self.log = log
|
self.log = log
|
||||||
html = u''
|
html = u''
|
||||||
|
top_levels = []
|
||||||
|
|
||||||
# Extract content from zip archive.
|
# Extract content from zip archive.
|
||||||
zf = ZipFile(stream)
|
zf = ZipFile(stream)
|
||||||
zf.extractall()
|
zf.extractall()
|
||||||
|
|
||||||
for x in walk('.'):
|
# Find the HTML file in the archive. It needs to be
|
||||||
|
# top level.
|
||||||
|
index = u''
|
||||||
|
multiple_html = False
|
||||||
|
# Get a list of all top level files in the archive.
|
||||||
|
for x in os.listdir('.'):
|
||||||
|
if os.path.isfile(x):
|
||||||
|
top_levels.append(x)
|
||||||
|
# Try to find an index file.
|
||||||
|
for x in top_levels:
|
||||||
|
if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
|
||||||
|
index = x
|
||||||
|
break
|
||||||
|
# Look for multiple HTML files in the archive. We look at the
|
||||||
|
# top level files only as only they matter in HTMLZ.
|
||||||
|
for x in top_levels:
|
||||||
if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
|
if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
|
||||||
with open(x, 'rb') as tf:
|
# Set index to the first HTML file found if it's not
|
||||||
html = tf.read()
|
# called index.
|
||||||
break
|
if not index:
|
||||||
|
index = x
|
||||||
|
else:
|
||||||
|
multiple_html = True
|
||||||
|
# Warn the user if there are multiple HTML files in the archive. HTMLZ
|
||||||
|
# supports a single HTML file. Converting a multiple-HTML-file
|
||||||
|
# HTMLZ archive probably won't turn out as the user expects. With
|
||||||
|
# multiple HTML files, ZIP input should be used in place of HTMLZ.
|
||||||
|
if multiple_html:
|
||||||
|
log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)
|
||||||
|
|
||||||
|
if index:
|
||||||
|
with open(index, 'rb') as tf:
|
||||||
|
html = tf.read()
|
||||||
|
else:
|
||||||
|
raise Exception(_('No top level HTML file found.'))
|
||||||
|
|
||||||
|
if not html:
|
||||||
|
raise Exception(_('Top level HTML file %s is empty') % index)
|
||||||
|
|
||||||
# Encoding
|
# Encoding
|
||||||
if options.input_encoding:
|
if options.input_encoding:
|
||||||
@ -75,7 +109,7 @@ class HTMLZInput(InputFormatPlugin):
|
|||||||
# Get the cover path from the OPF.
|
# Get the cover path from the OPF.
|
||||||
cover_path = None
|
cover_path = None
|
||||||
opf = None
|
opf = None
|
||||||
for x in walk('.'):
|
for x in top_levels:
|
||||||
if os.path.splitext(x)[1].lower() in ('.opf'):
|
if os.path.splitext(x)[1].lower() in ('.opf'):
|
||||||
opf = x
|
opf = x
|
||||||
break
|
break
|
||||||
|
Loading…
x
Reference in New Issue
Block a user