diff --git a/src/calibre/ebooks/conversion/plugins/docx_input.py b/src/calibre/ebooks/conversion/plugins/docx_input.py
index 7492d46c68..190a771379 100644
--- a/src/calibre/ebooks/conversion/plugins/docx_input.py
+++ b/src/calibre/ebooks/conversion/plugins/docx_input.py
@@ -14,9 +14,17 @@ class DOCXInput(InputFormatPlugin):
     description = 'Convert DOCX files (.docx) to HTML'
     file_types = set(['docx'])
 
+    options = {
+        OptionRecommendation(name='docx_no_cover', recommended_value=False,
+            help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
+                   'it will be removed from the document and used as the cover for created ebook. This option '
+                   'turns off that behavior.')),
+
+    }
+
+
     recommendations = set([('page_breaks_before', '/', OptionRecommendation.MED)])
 
     def convert(self, stream, options, file_ext, log, accelerators):
         from calibre.ebooks.docx.to_html import Convert
-        return Convert(stream, log=log)()
+        return Convert(stream, detect_cover=not options.docx_no_cover, log=log)()
diff --git a/src/calibre/ebooks/docx/cleanup.py b/src/calibre/ebooks/docx/cleanup.py
index 2b1e095025..a55f8449d8 100644
--- a/src/calibre/ebooks/docx/cleanup.py
+++ b/src/calibre/ebooks/docx/cleanup.py
@@ -6,6 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal '
 
+import os
 
 def mergeable(previous, current):
     if previous.tail or current.tail:
@@ -83,8 +84,31 @@ def lift(span):
         else:
             add_text(last_child, 'tail', span.tail)
 
+def before_count(root, tag, limit=10):
+    '''
+    Count the elements that come before ``tag`` inside the first <body> found
+    in ``root``, in document order.
+
+    ``limit`` is returned instead of the real count when there is no <body>,
+    when more than ``limit`` elements precede ``tag``, and when ``tag`` is
+    never reached, so callers can always compare the result with a number.
+    '''
+    body = root.xpath('//body[1]')
+    if not body:
+        return limit
+    ans = 0
+    for elem in body[0].iterdescendants():
+        if elem is tag:
+            return ans
+        ans += 1
+        if ans > limit:
+            return limit
+    # tag is not a descendant of <body>. Falling off the end here would
+    # return None, which Python 2 sorts before every number, so that
+    # "before_count(...) < 5" would be wrongly True.
+    return limit
 
-def cleanup_markup(root, styles):
+def cleanup_markup(log, root, styles, dest_dir, detect_cover):
     # Merge consecutive spans that have the same styling
     current_run = []
     for span in root.xpath('//span'):
@@ -134,3 +158,26 @@ def cleanup_markup(root, styles):
     for span in root.xpath('//span[not(@class) and not(@id)]'):
         lift(span)
 
+    if detect_cover:
+        # Check if the first image in the document is possibly a cover
+        img = root.xpath('//img[@src][1]')
+        if img:
+            img = img[0]
+            path = os.path.join(dest_dir, img.get('src'))
+            if os.path.exists(path) and before_count(root, img, limit=10) < 5:
+                from calibre.utils.magick.draw import identify
+                try:
+                    width, height, fmt = identify(path)
+                except Exception:
+                    # Not a readable image, so it cannot be a cover
+                    width, height, fmt = 0, 0, None
+                # A cover has a height of 0.8 to 1.8 times its width and an
+                # area of at least 160000. Test the width first: it is 0 when
+                # identify() failed and height/width would then raise
+                # ZeroDivisionError
+                is_cover = width > 0 and 0.8 <= height/width <= 1.8 and height*width >= 160000
+                if is_cover:
+                    log.debug('Detected an image that looks like a cover')
+                    img.getparent().remove(img)
+                    return path
+
+
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
index c3b2391d3f..963d1fc6c8 100644
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -40,11 +40,12 @@ class Text:
 
 class Convert(object):
 
-    def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
+    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
         self.docx = DOCX(path_or_stream, log=log)
         self.ms_pat = re.compile(r'\s{2,}')
         self.ws_pat = re.compile(r'[\n\r\t]')
         self.log = self.docx.log
+        self.detect_cover = detect_cover
         self.notes_text = notes_text or _('Notes')
         self.dest_dir = dest_dir or os.getcwdu()
         self.mi = self.docx.metadata
@@ -169,7 +170,7 @@ class Convert(object):
                     break
 
         self.log.debug('Cleaning up redundant markup generated by Word')
-        cleanup_markup(self.html, self.styles)
+        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover)
 
         return self.write(doc)
 
@@ -280,6 +281,8 @@ class Convert(object):
         opf.toc = toc
         opf.create_manifest_from_files_in([self.dest_dir])
         opf.create_spine(['index.html'])
+        if self.cover_image is not None:
+            opf.guide.set_cover(self.cover_image)
         with open(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(os.path.join(self.dest_dir, 'toc.ncx'), 'wb') as ncx:
             opf.render(of, ncx, 'toc.ncx')
         return os.path.join(self.dest_dir, 'metadata.opf')
diff --git a/src/calibre/gui2/convert/docx_input.py b/src/calibre/gui2/convert/docx_input.py
new file mode 100644
index 0000000000..46234c6a36
--- /dev/null
+++ b/src/calibre/gui2/convert/docx_input.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal '
+
+from calibre.gui2.convert.docx_input_ui import Ui_Form
+from calibre.gui2.convert import Widget
+
+class PluginWidget(Widget, Ui_Form):
+    ''' Widget for the options specific to DOCX input (currently only docx_no_cover) '''
+
+    TITLE = _('DOCX Input')
+    HELP = _('Options specific to')+' DOCX '+_('input')
+    COMMIT_NAME = 'docx_input'
+    ICON = I('mimetypes/docx.png')
+
+    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
+        Widget.__init__(self, parent,
+                ['docx_no_cover', ])
+        self.initialize_options(get_option, get_help, db, book_id)
+
diff --git a/src/calibre/gui2/convert/docx_input.ui b/src/calibre/gui2/convert/docx_input.ui
new file mode 100644
index 0000000000..41948118dc
--- /dev/null
+++ b/src/calibre/gui2/convert/docx_input.ui
@@ -0,0 +1,41 @@
+
+
+ Form
+
+
+
+ 0
+ 0
+ 518
+ 353
+
+
+
+ Form
+
+
+
+
+
+ Do not try to autodetect a &cover from images in the document
+
+
+
+
+
+
+ Qt::Vertical
+
+
+
+ 20
+ 213
+
+
+
+
+
+
+
+
+