From 752bd9e06ea7df02296fd2c563ce1e31e8da46ff Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Jun 2013 11:03:15 +0530
Subject: [PATCH] DOCX Input: Detect likely cover image

DOCX Input: If a large image that looks like a cover is present at the
start of the document, remove it and use it as the cover of the output
ebook. This can be turned off under the DOCX Input section of the
conversion dialog.
---
 .../ebooks/conversion/plugins/docx_input.py   | 10 ++++-
 src/calibre/ebooks/docx/cleanup.py            | 33 ++++++++++++++-
 src/calibre/ebooks/docx/to_html.py            |  7 +++-
 src/calibre/gui2/convert/docx_input.py        | 23 +++++++++++
 src/calibre/gui2/convert/docx_input.ui        | 41 +++++++++++++++++++
 5 files changed, 110 insertions(+), 4 deletions(-)
 create mode 100644 src/calibre/gui2/convert/docx_input.py
 create mode 100644 src/calibre/gui2/convert/docx_input.ui
diff --git a/src/calibre/ebooks/conversion/plugins/docx_input.py b/src/calibre/ebooks/conversion/plugins/docx_input.py
index 7492d46c68..190a771379 100644
--- a/src/calibre/ebooks/conversion/plugins/docx_input.py
+++ b/src/calibre/ebooks/conversion/plugins/docx_input.py
@@ -14,9 +14,17 @@ class DOCXInput(InputFormatPlugin):
     description = 'Convert DOCX files (.docx) to HTML'
     file_types = set(['docx'])
 
+    options = {
+        OptionRecommendation(name='docx_no_cover', recommended_value=False,
+            help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
+                   'it will be removed from the document and used as the cover for created ebook. This option '
+                   'turns off that behavior.')),
+
+    }
+
     recommendations = set([('page_breaks_before', '/', OptionRecommendation.MED)])
 
     def convert(self, stream, options, file_ext, log, accelerators):
         from calibre.ebooks.docx.to_html import Convert
-        return Convert(stream, log=log)()
+        return Convert(stream, detect_cover=not options.docx_no_cover, log=log)()
 
diff --git a/src/calibre/ebooks/docx/cleanup.py b/src/calibre/ebooks/docx/cleanup.py
index 2b1e095025..a55f8449d8 100644
--- a/src/calibre/ebooks/docx/cleanup.py
+++ b/src/calibre/ebooks/docx/cleanup.py
@@ -6,6 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 
+import os
 
 def mergeable(previous, current):
     if previous.tail or current.tail:
@@ -83,8 +84,19 @@ def lift(span):
         else:
             add_text(last_child, 'tail', span.tail)
 
+def before_count(root, tag, limit=10):
+    # Number of elements preceding tag inside <body>, capped at limit; limit if tag is absent
+    body = root.xpath('//body[1]')
+    if not body:
+        return limit
+    for ans, elem in enumerate(body[0].iterdescendants()):
+        if elem is tag:
+            return ans
+        if ans >= limit:
+            break
+    return limit  # never fall through with an implicit None
 
-def cleanup_markup(root, styles):
+def cleanup_markup(log, root, styles, dest_dir, detect_cover):
     # Merge consecutive spans that have the same styling
     current_run = []
     for span in root.xpath('//span'):
@@ -134,3 +146,22 @@ def cleanup_markup(root, styles):
     for span in root.xpath('//span[not(@class) and not(@id)]'):
         lift(span)
 
+    if detect_cover:
+        # Check if the first image in the document is possibly a cover
+        img = root.xpath('//img[@src][1]')
+        if img:
+            img = img[0]
+            path = os.path.join(dest_dir, img.get('src'))
+            if os.path.exists(path) and before_count(root, img, limit=10) < 5:
+                from calibre.utils.magick.draw import identify
+                try:
+                    width, height, fmt = identify(path)
+                except Exception:
+                    width, height, fmt = 0, 0, None
+                # width > 0: an identify() failure (or zero-width image) must not divide by zero
+                if width > 0 and 0.8 <= height/width <= 1.8 and height*width >= 160000:
+                    log.debug('Detected an image that looks like a cover')
+                    img.getparent().remove(img)
+                    return path
+
+
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
index c3b2391d3f..963d1fc6c8 100644
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -40,11 +40,12 @@ class Text:
 
 class Convert(object):
 
-    def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
+    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
         self.docx = DOCX(path_or_stream, log=log)
         self.ms_pat = re.compile(r'\s{2,}')
         self.ws_pat = re.compile(r'[\n\r\t]')
         self.log = self.docx.log
+        self.detect_cover = detect_cover
         self.notes_text = notes_text or _('Notes')
         self.dest_dir = dest_dir or os.getcwdu()
         self.mi = self.docx.metadata
@@ -169,7 +170,7 @@ class Convert(object):
                 break
 
         self.log.debug('Cleaning up redundant markup generated by Word')
-        cleanup_markup(self.html, self.styles)
+        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover)
 
         return self.write(doc)
 
@@ -280,6 +281,8 @@ class Convert(object):
         opf.toc = toc
         opf.create_manifest_from_files_in([self.dest_dir])
         opf.create_spine(['index.html'])
+        if self.cover_image is not None:
+            opf.guide.set_cover(self.cover_image)
         with open(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(os.path.join(self.dest_dir, 'toc.ncx'), 'wb') as ncx:
             opf.render(of, ncx, 'toc.ncx')
         return os.path.join(self.dest_dir, 'metadata.opf')
diff --git a/src/calibre/gui2/convert/docx_input.py b/src/calibre/gui2/convert/docx_input.py
new file mode 100644
index 0000000000..46234c6a36
--- /dev/null
+++ b/src/calibre/gui2/convert/docx_input.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from calibre.gui2.convert.docx_input_ui import Ui_Form
+from calibre.gui2.convert import Widget
+
+class PluginWidget(Widget, Ui_Form):
+    # Conversion dialog page for the options of the DOCX input plugin
+    TITLE = _('DOCX Input')
+    HELP = _('Options specific to')+' DOCX '+_('input')
+    COMMIT_NAME = 'docx_input'
+    ICON = I('mimetypes/docx.png')
+
+    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
+        option_names = ['docx_no_cover']
+        Widget.__init__(self, parent, option_names)
+        self.initialize_options(get_option, get_help, db, book_id)
+
diff --git a/src/calibre/gui2/convert/docx_input.ui b/src/calibre/gui2/convert/docx_input.ui
new file mode 100644
index 0000000000..41948118dc
--- /dev/null
+++ b/src/calibre/gui2/convert/docx_input.ui
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>Form</class>
+ <widget class="QWidget" name="Form">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>518</width>
+    <height>353</height>
+   </rect>
+  </property>
+  <property name="windowTitle">
+   <string>Form</string>
+  </property>
+  <layout class="QVBoxLayout" name="verticalLayout_3">
+   <item>
+    <widget class="QCheckBox" name="opt_docx_no_cover">
+     <property name="text">
+      <string>Do not try to autodetect a &amp;cover from images in the document</string>
+     </property>
+    </widget>
+   </item>
+   <item>
+    <spacer name="verticalSpacer">
+     <property name="orientation">
+      <enum>Qt::Vertical</enum>
+     </property>
+     <property name="sizeHint" stdset="0">
+      <size>
+       <width>20</width>
+       <height>213</height>
+      </size>
+     </property>
+    </spacer>
+   </item>
+  </layout>
+ </widget>
+ <resources/>
+ <connections/>
+</ui>