mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
...
This commit is contained in:
commit
9d041c7969
@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from calibre import guess_type, walk
|
from calibre import guess_type
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.ebooks.metadata.opf2 import OPF
|
from calibre.ebooks.metadata.opf2 import OPF
|
||||||
@ -25,16 +25,50 @@ class HTMLZInput(InputFormatPlugin):
|
|||||||
accelerators):
|
accelerators):
|
||||||
self.log = log
|
self.log = log
|
||||||
html = u''
|
html = u''
|
||||||
|
top_levels = []
|
||||||
|
|
||||||
# Extract content from zip archive.
|
# Extract content from zip archive.
|
||||||
zf = ZipFile(stream)
|
zf = ZipFile(stream)
|
||||||
zf.extractall()
|
zf.extractall()
|
||||||
|
|
||||||
for x in walk('.'):
|
# Find the HTML file in the archive. It needs to be
|
||||||
if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
|
# top level.
|
||||||
with open(x, 'rb') as tf:
|
index = u''
|
||||||
html = tf.read()
|
multiple_html = False
|
||||||
|
# Get a list of all top level files in the archive.
|
||||||
|
for x in os.listdir('.'):
|
||||||
|
if os.path.isfile(x):
|
||||||
|
top_levels.append(x)
|
||||||
|
# Try to find an index file.
|
||||||
|
for x in top_levels:
|
||||||
|
if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
|
||||||
|
index = x
|
||||||
break
|
break
|
||||||
|
# Look for multiple HTML files in the archive. We look at the
|
||||||
|
# top level files only as only they matter in HTMLZ.
|
||||||
|
for x in top_levels:
|
||||||
|
if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
|
||||||
|
# Set index to the first HTML file found if it's not
|
||||||
|
# called index.
|
||||||
|
if not index:
|
||||||
|
index = x
|
||||||
|
else:
|
||||||
|
multiple_html = True
|
||||||
|
# Warn the user if there are multiple HTML files in the archive. HTMLZ
|
||||||
|
# supports a single HTML file. A conversion of a multiple-HTML-file
|
||||||
|
# HTMLZ archive probably won't turn out as the user expects. With
|
||||||
|
# multiple HTML files, ZIP input should be used in place of HTMLZ.
|
||||||
|
if multiple_html:
|
||||||
|
log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)
|
||||||
|
|
||||||
|
if index:
|
||||||
|
with open(index, 'rb') as tf:
|
||||||
|
html = tf.read()
|
||||||
|
else:
|
||||||
|
raise Exception(_('No top level HTML file found.'))
|
||||||
|
|
||||||
|
if not html:
|
||||||
|
raise Exception(_('Top level HTML file %s is empty') % index)
|
||||||
|
|
||||||
# Encoding
|
# Encoding
|
||||||
if options.input_encoding:
|
if options.input_encoding:
|
||||||
@ -75,7 +109,7 @@ class HTMLZInput(InputFormatPlugin):
|
|||||||
# Get the cover path from the OPF.
|
# Get the cover path from the OPF.
|
||||||
cover_path = None
|
cover_path = None
|
||||||
opf = None
|
opf = None
|
||||||
for x in walk('.'):
|
for x in top_levels:
|
||||||
if os.path.splitext(x)[1].lower() in ('.opf'):
|
if os.path.splitext(x)[1].lower() in ('.opf'):
|
||||||
opf = x
|
opf = x
|
||||||
break
|
break
|
||||||
|
Loading…
x
Reference in New Issue
Block a user