From 6ec37ff715039c71883a7d79450770317e5edbaf Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 20 Apr 2009 11:47:33 -0700 Subject: [PATCH] Add support for zip, rar and oebzip archives as input formats --- src/calibre/ebooks/conversion/cli.py | 2 +- src/calibre/ebooks/conversion/plumber.py | 54 ++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 6d5401278a..fd99e1e346 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -133,7 +133,7 @@ def add_pipeline_options(parser, plumber): [ 'level1_toc', 'level2_toc', 'level3_toc', 'toc_threshold', 'max_toc_links', 'no_chapters_in_toc', - 'use_auto_toc', + 'use_auto_toc', 'toc_filter', ] ), diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 3a2d39c314..2b78aca822 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -3,13 +3,21 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import os +import os, re from calibre.customize.conversion import OptionRecommendation from calibre.customize.ui import input_profiles, output_profiles, \ plugin_for_input_format, plugin_for_output_format from calibre.ebooks.conversion.preprocess import HTMLPreProcessor from calibre.ptempfile import PersistentTemporaryDirectory +from calibre import extract, walk + +def supported_input_formats(): + from calibre.customize.ui import available_input_formats + fmts = available_input_formats() + for x in ('zip', 'rar', 'oebzip'): + fmts.add(x) + return fmts class OptionValues(object): pass @@ -279,11 +287,14 @@ OptionRecommendation(name='language', help=_('Set the language.')), ] - input_fmt = os.path.splitext(self.input)[1] if not input_fmt: raise ValueError('Input file must have an extension') input_fmt = input_fmt[1:].lower() + if input_fmt in ('zip', 'rar', 'oebzip'): + self.log('Processing archive...') + tdir = PersistentTemporaryDirectory('_plumber') + self.input, input_fmt = self.unarchive(self.input, tdir) if os.path.exists(self.output) and os.path.isdir(self.output): output_fmt = 'oeb' @@ -293,7 +304,7 @@ OptionRecommendation(name='language', output_fmt = '.oeb' output_fmt = output_fmt[1:].lower() - self.input_plugin = plugin_for_input_format(input_fmt) + self.input_plugin = plugin_for_input_format(input_fmt) self.output_plugin = plugin_for_output_format(output_fmt) if self.input_plugin is None: @@ -316,6 +327,43 @@ OptionRecommendation(name='language', # plugins. self.merge_plugin_recommendations() + @classmethod + def unarchive(self, path, tdir): + extract(path, tdir) + files = list(walk(tdir)) + from calibre.customize.ui import available_input_formats + fmts = available_input_formats() + for x in ('htm', 'html', 'xhtm', 'xhtml'): fmts.remove(x) + + for ext in fmts: + for f in files: + if f.lower().endswith('.'+ext): + if ext in ['txt', 'rtf'] and os.stat(f).st_size < 2048: + continue + return f, ext + return self.find_html_index(files) + + @classmethod + def find_html_index(self, files): + ''' + Given a list of files, find the most likely root HTML file in the + list. + ''' + html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE) + html_files = [f for f in files if html_pat.search(f) is not None] + if not html_files: + raise ValueError(_('Could not find an ebook inside the archive')) + html_files = [(f, os.stat(f).st_size) for f in html_files] + html_files.sort(cmp = lambda x, y: cmp(x[1], y[1])) + html_files = [f[0] for f in html_files] + for q in ('toc', 'index'): + for f in html_files: + if os.path.splitext(os.path.basename(f))[0].lower() == q: + return f, os.path.splitext(f)[1].lower()[1:] + return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:] + + + def get_option_by_name(self, name): for group in (self.input_options, self.pipeline_options, self.output_options):