From 752bd9e06ea7df02296fd2c563ce1e31e8da46ff Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 17 Jun 2013 11:03:15 +0530 Subject: [PATCH] DOCX Input: Detect likely cover image DOCX Input: If a large image that looks like a cover is present at the start of the document, remove it and use it as the cover of the output ebook. This can be turned off under the DOCX Input section of the conversion dialog. --- .../ebooks/conversion/plugins/docx_input.py | 10 ++++- src/calibre/ebooks/docx/cleanup.py | 33 ++++++++++++++- src/calibre/ebooks/docx/to_html.py | 7 +++- src/calibre/gui2/convert/docx_input.py | 23 +++++++++++ src/calibre/gui2/convert/docx_input.ui | 41 +++++++++++++++++++ 5 files changed, 110 insertions(+), 4 deletions(-) create mode 100644 src/calibre/gui2/convert/docx_input.py create mode 100644 src/calibre/gui2/convert/docx_input.ui diff --git a/src/calibre/ebooks/conversion/plugins/docx_input.py b/src/calibre/ebooks/conversion/plugins/docx_input.py index 7492d46c68..190a771379 100644 --- a/src/calibre/ebooks/conversion/plugins/docx_input.py +++ b/src/calibre/ebooks/conversion/plugins/docx_input.py @@ -14,9 +14,17 @@ class DOCXInput(InputFormatPlugin): description = 'Convert DOCX files (.docx) to HTML' file_types = set(['docx']) + options = { + OptionRecommendation(name='docx_no_cover', recommended_value=False, + help=_('Normally, if a large image is present at the start of the document that looks like a cover, ' + 'it will be removed from the document and used as the cover for created ebook. 
This option ' + 'turns off that behavior.')), + + } + recommendations = set([('page_breaks_before', '/', OptionRecommendation.MED)]) def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.docx.to_html import Convert - return Convert(stream, log=log)() + return Convert(stream, detect_cover=not options.docx_no_cover, log=log)() diff --git a/src/calibre/ebooks/docx/cleanup.py b/src/calibre/ebooks/docx/cleanup.py index 2b1e095025..a55f8449d8 100644 --- a/src/calibre/ebooks/docx/cleanup.py +++ b/src/calibre/ebooks/docx/cleanup.py @@ -6,6 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' +import os def mergeable(previous, current): if previous.tail or current.tail: @@ -83,8 +84,19 @@ def lift(span): else: add_text(last_child, 'tail', span.tail) +def before_count(root, tag, limit=10): + body = root.xpath('//body[1]') + if not body: + return limit + ans = 0 + for elem in body[0].iterdescendants(): + if elem is tag: + return ans + ans += 1 + if ans > limit: + return limit -def cleanup_markup(root, styles): +def cleanup_markup(log, root, styles, dest_dir, detect_cover): # Merge consecutive spans that have the same styling current_run = [] for span in root.xpath('//span'): @@ -134,3 +146,22 @@ def cleanup_markup(root, styles): for span in root.xpath('//span[not(@class) and not(@id)]'): lift(span) + if detect_cover: + # Check if the first image in the document is possibly a cover + img = root.xpath('//img[@src][1]') + if img: + img = img[0] + path = os.path.join(dest_dir, img.get('src')) + if os.path.exists(path) and before_count(root, img, limit=10) < 5: + from calibre.utils.magick.draw import identify + try: + width, height, fmt = identify(path) + except: + width, height, fmt = 0, 0, None + is_cover = 0.8 <= height/width <= 1.8 and height*width >= 160000 + if is_cover: + log.debug('Detected an image that looks like a cover') + img.getparent().remove(img) + 
return path + + diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py index c3b2391d3f..963d1fc6c8 100644 --- a/src/calibre/ebooks/docx/to_html.py +++ b/src/calibre/ebooks/docx/to_html.py @@ -40,11 +40,12 @@ class Text: class Convert(object): - def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None): + def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None): self.docx = DOCX(path_or_stream, log=log) self.ms_pat = re.compile(r'\s{2,}') self.ws_pat = re.compile(r'[\n\r\t]') self.log = self.docx.log + self.detect_cover = detect_cover self.notes_text = notes_text or _('Notes') self.dest_dir = dest_dir or os.getcwdu() self.mi = self.docx.metadata @@ -169,7 +170,7 @@ class Convert(object): break self.log.debug('Cleaning up redundant markup generated by Word') - cleanup_markup(self.html, self.styles) + self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover) return self.write(doc) @@ -280,6 +281,8 @@ class Convert(object): opf.toc = toc opf.create_manifest_from_files_in([self.dest_dir]) opf.create_spine(['index.html']) + if self.cover_image is not None: + opf.guide.set_cover(self.cover_image) with open(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(os.path.join(self.dest_dir, 'toc.ncx'), 'wb') as ncx: opf.render(of, ncx, 'toc.ncx') return os.path.join(self.dest_dir, 'metadata.opf') diff --git a/src/calibre/gui2/convert/docx_input.py b/src/calibre/gui2/convert/docx_input.py new file mode 100644 index 0000000000..46234c6a36 --- /dev/null +++ b/src/calibre/gui2/convert/docx_input.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' + +from calibre.gui2.convert.docx_input_ui import Ui_Form +from calibre.gui2.convert import Widget + +class PluginWidget(Widget, 
Ui_Form): + + TITLE = _('DOCX Input') + HELP = _('Options specific to')+' DOCX '+_('input') + COMMIT_NAME = 'docx_input' + ICON = I('mimetypes/docx.png') + + def __init__(self, parent, get_option, get_help, db=None, book_id=None): + Widget.__init__(self, parent, + ['docx_no_cover', ]) + self.initialize_options(get_option, get_help, db, book_id) + diff --git a/src/calibre/gui2/convert/docx_input.ui b/src/calibre/gui2/convert/docx_input.ui new file mode 100644 index 0000000000..41948118dc --- /dev/null +++ b/src/calibre/gui2/convert/docx_input.ui @@ -0,0 +1,41 @@ + + + Form + + + + 0 + 0 + 518 + 353 + + + + Form + + + + + + Do not try to autodetect a &cover from images in the document + + + + + + + Qt::Vertical + + + + 20 + 213 + + + + + + + + +