DOCX Input: Detect likely cover image

DOCX Input: If a large image that looks like a cover is present at the
start of the document, remove it and use it as the cover of the output
ebook. This can be turned off under the DOCX Input section of the
conversion dialog.
This commit is contained in:
Kovid Goyal 2013-06-17 11:03:15 +05:30
parent 7a0675e59a
commit 752bd9e06e
5 changed files with 110 additions and 4 deletions

View File

@ -14,9 +14,17 @@ class DOCXInput(InputFormatPlugin):
description = 'Convert DOCX files (.docx) to HTML'
file_types = set(['docx'])
options = {
OptionRecommendation(name='docx_no_cover', recommended_value=False,
help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
'it will be removed from the document and used as the cover for created ebook. This option '
'turns off that behavior.')),
}
recommendations = set([('page_breaks_before', '/', OptionRecommendation.MED)])
def convert(self, stream, options, file_ext, log, accelerators):
from calibre.ebooks.docx.to_html import Convert
return Convert(stream, log=log)()
return Convert(stream, detect_cover=not options.docx_no_cover, log=log)()

View File

@ -6,6 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os
def mergeable(previous, current):
if previous.tail or current.tail:
@ -83,8 +84,19 @@ def lift(span):
else:
add_text(last_child, 'tail', span.tail)
def before_count(root, tag, limit=10):
    '''
    Count the elements that precede ``tag`` inside the first ``<body>``.

    Descendants of the body are walked in the order produced by
    ``iterdescendants()`` and compared to ``tag`` by identity.

    :param root: Tree root; must provide ``xpath()``.
    :param tag: The element to look for.
    :param limit: Upper bound on the answer. The walk stops once more than
        ``limit`` elements have been seen.
    :return: The number of elements seen before ``tag``, or ``limit`` when
        there is no body, when ``tag`` is not reached within ``limit``
        elements, or when ``tag`` is not a descendant of the body at all.
    '''
    body = root.xpath('//body[1]')
    if not body:
        return limit
    ans = 0
    for elem in body[0].iterdescendants():
        if elem is tag:
            return ans
        ans += 1
        if ans > limit:
            return limit
    # The walk ended without meeting tag. Always return a number here:
    # falling off the end would return None, which callers then compare
    # against an integer threshold.
    return limit
def cleanup_markup(root, styles):
def cleanup_markup(log, root, styles, dest_dir, detect_cover):
# Merge consecutive spans that have the same styling
current_run = []
for span in root.xpath('//span'):
@ -134,3 +146,22 @@ def cleanup_markup(root, styles):
for span in root.xpath('//span[not(@class) and not(@id)]'):
lift(span)
if detect_cover:
# Check if the first image in the document is possibly a cover
img = root.xpath('//img[@src][1]')
if img:
img = img[0]
path = os.path.join(dest_dir, img.get('src'))
if os.path.exists(path) and before_count(root, img, limit=10) < 5:
from calibre.utils.magick.draw import identify
try:
width, height, fmt = identify(path)
except:
width, height, fmt = 0, 0, None
is_cover = 0.8 <= height/width <= 1.8 and height*width >= 160000
if is_cover:
log.debug('Detected an image that looks like a cover')
img.getparent().remove(img)
return path

View File

@ -40,11 +40,12 @@ class Text:
class Convert(object):
def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
self.docx = DOCX(path_or_stream, log=log)
self.ms_pat = re.compile(r'\s{2,}')
self.ws_pat = re.compile(r'[\n\r\t]')
self.log = self.docx.log
self.detect_cover = detect_cover
self.notes_text = notes_text or _('Notes')
self.dest_dir = dest_dir or os.getcwdu()
self.mi = self.docx.metadata
@ -169,7 +170,7 @@ class Convert(object):
break
self.log.debug('Cleaning up redundant markup generated by Word')
cleanup_markup(self.html, self.styles)
self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover)
return self.write(doc)
@ -280,6 +281,8 @@ class Convert(object):
opf.toc = toc
opf.create_manifest_from_files_in([self.dest_dir])
opf.create_spine(['index.html'])
if self.cover_image is not None:
opf.guide.set_cover(self.cover_image)
with open(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(os.path.join(self.dest_dir, 'toc.ncx'), 'wb') as ncx:
opf.render(of, ncx, 'toc.ncx')
return os.path.join(self.dest_dir, 'metadata.opf')

View File

@ -0,0 +1,23 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.gui2.convert.docx_input_ui import Ui_Form
from calibre.gui2.convert import Widget
class PluginWidget(Widget, Ui_Form):
    '''Options widget for DOCX input; manages the ``docx_no_cover`` option.'''

    TITLE = _('DOCX Input')
    HELP = _('Options specific to')+' DOCX '+_('input')
    COMMIT_NAME = 'docx_input'
    ICON = I('mimetypes/docx.png')

    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
        # Names of the conversion options handled by this widget
        option_names = ['docx_no_cover']
        Widget.__init__(self, parent, option_names)
        self.initialize_options(get_option, get_help, db, book_id)

View File

@ -0,0 +1,41 @@
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
<class>Form</class>
<widget class="QWidget" name="Form">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>518</width>
<height>353</height>
</rect>
</property>
<property name="windowTitle">
<string>Form</string>
</property>
<layout class="QVBoxLayout" name="verticalLayout_3">
<item>
<widget class="QCheckBox" name="opt_docx_no_cover">
<property name="text">
<string>Do not try to autodetect a &amp;cover from images in the document</string>
</property>
</widget>
</item>
<item>
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>213</height>
</size>
</property>
</spacer>
</item>
</layout>
</widget>
<resources/>
<connections/>
</ui>