DOCX Input: Detect likely cover image

DOCX Input: If a large image that looks like a cover is present at the start of the document, remove it and use it as the cover of the output ebook. This can be turned off under the DOCX Input section of the conversion dialog.
2025-06-23 15:30:45 -04:00 · 2013-06-17 11:03:15 +05:30 · 2013-06-17 11:03:15 +05:30 · 752bd9e06e
commit 752bd9e06e
parent 7a0675e59a
5 changed files with 110 additions and 4 deletions
--- a/src/calibre/ebooks/conversion/plugins/docx_input.py
+++ b/src/calibre/ebooks/conversion/plugins/docx_input.py
@ -14,9 +14,17 @@ class DOCXInput(InputFormatPlugin):
    description = 'Convert DOCX files (.docx) to HTML'
    file_types = set(['docx'])

+    options = {
+        OptionRecommendation(name='docx_no_cover', recommended_value=False,
+            help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
+                   'it will be removed from the document and used as the cover for created ebook. This option '
+                   'turns off that behavior.')),
+
+    }
+
    recommendations = set([('page_breaks_before', '/', OptionRecommendation.MED)])

    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.docx.to_html import Convert
-        return Convert(stream, log=log)()
+        return Convert(stream, detect_cover=not options.docx_no_cover, log=log)()

--- a/src/calibre/ebooks/docx/cleanup.py
+++ b/src/calibre/ebooks/docx/cleanup.py
@ -6,6 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

+import os

 def mergeable(previous, current):
    if previous.tail or current.tail:
@ -83,8 +84,19 @@ def lift(span):
        else:
            add_text(last_child, 'tail', span.tail)

+def before_count(root, tag, limit=10):
+    body = root.xpath('//body[1]')
+    if not body:
+        return limit
+    ans = 0
+    for elem in body[0].iterdescendants():
+        if elem is tag:
+            return ans
+        ans += 1
+        if ans > limit:
+            return limit

-def cleanup_markup(root, styles):
+def cleanup_markup(log, root, styles, dest_dir, detect_cover):
    # Merge consecutive spans that have the same styling
    current_run = []
    for span in root.xpath('//span'):
@ -134,3 +146,22 @@ def cleanup_markup(root, styles):
    for span in root.xpath('//span[not(@class) and not(@id)]'):
        lift(span)

+    if detect_cover:
+        # Check if the first image in the document is possibly a cover
+        img = root.xpath('//img[@src][1]')
+        if img:
+            img = img[0]
+            path = os.path.join(dest_dir, img.get('src'))
+            if os.path.exists(path) and before_count(root, img, limit=10) < 5:
+                from calibre.utils.magick.draw import identify
+                try:
+                    width, height, fmt = identify(path)
+                except:
+                    width, height, fmt = 0, 0, None
+                is_cover = 0.8 <= height/width <= 1.8 and height*width >= 160000
+                if is_cover:
+                    log.debug('Detected an image that looks like a cover')
+                    img.getparent().remove(img)
+                    return path
+
+
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@ -40,11 +40,12 @@ class Text:

 class Convert(object):

-    def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
+    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
        self.docx = DOCX(path_or_stream, log=log)
        self.ms_pat = re.compile(r'\s{2,}')
        self.ws_pat = re.compile(r'[\n\r\t]')
        self.log = self.docx.log
+        self.detect_cover = detect_cover
        self.notes_text = notes_text or _('Notes')
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
@ -169,7 +170,7 @@ class Convert(object):
                break

        self.log.debug('Cleaning up redundant markup generated by Word')
-        cleanup_markup(self.html, self.styles)
+        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover)

        return self.write(doc)

@ -280,6 +281,8 @@ class Convert(object):
        opf.toc = toc
        opf.create_manifest_from_files_in([self.dest_dir])
        opf.create_spine(['index.html'])
+        if self.cover_image is not None:
+            opf.guide.set_cover(self.cover_image)
        with open(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(os.path.join(self.dest_dir, 'toc.ncx'), 'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx')
        return os.path.join(self.dest_dir, 'metadata.opf')
--- a/src/calibre/gui2/convert/docx_input.py
+++ b/src/calibre/gui2/convert/docx_input.py
@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from calibre.gui2.convert.docx_input_ui import Ui_Form
+from calibre.gui2.convert import Widget
+
+class PluginWidget(Widget, Ui_Form):
+
+    TITLE = _('DOCX Input')
+    HELP = _('Options specific to')+' DOCX '+_('input')
+    COMMIT_NAME = 'docx_input'
+    ICON = I('mimetypes/docx.png')
+
+    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
+        Widget.__init__(self, parent,
+            ['docx_no_cover', ])
+        self.initialize_options(get_option, get_help, db, book_id)
+
--- a/src/calibre/gui2/convert/docx_input.ui
+++ b/src/calibre/gui2/convert/docx_input.ui
@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>Form</class>
+ <widget class="QWidget" name="Form">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>518</width>
+    <height>353</height>
+   </rect>
+  </property>
+  <property name="windowTitle">
+   <string>Form</string>
+  </property>
+  <layout class="QVBoxLayout" name="verticalLayout_3">
+   <item>
+    <widget class="QCheckBox" name="opt_docx_no_cover">
+     <property name="text">
+      <string>Do not try to autodetect a &amp;cover from images in the document</string>
+     </property>
+    </widget>
+   </item>
+   <item>
+    <spacer name="verticalSpacer">
+     <property name="orientation">
+      <enum>Qt::Vertical</enum>
+     </property>
+     <property name="sizeHint" stdset="0">
+      <size>
+       <width>20</width>
+       <height>213</height>
+      </size>
+     </property>
+    </spacer>
+   </item>
+  </layout>
+ </widget>
+ <resources/>
+ <connections/>
+</ui>