diff --git a/src/calibre/ebooks/docx/cleanup.py b/src/calibre/ebooks/docx/cleanup.py
new file mode 100644
index 0000000000..4c268cb0f1
--- /dev/null
+++ b/src/calibre/ebooks/docx/cleanup.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal '
+
+
+def mergeable(previous, current):
+    '''Return True if the span `current` can be merged into `previous`.
+
+    `current` must be the sibling immediately following `previous`, neither
+    span may have tail text, both must have the same class attribute and
+    `current` must not have an id, as merge() discards the merged span
+    together with its attributes.
+    '''
+    if previous.tail or current.tail:
+        return False
+    if previous.get('class', None) != current.get('class', None):
+        return False
+    if current.get('id', False):
+        return False
+    try:
+        return next(previous.itersiblings()) is current
+    except StopIteration:
+        return False
+
+
+def append_text(parent, text):
+    '''Append `text` to the end of the content of `parent`.'''
+    if len(parent) > 0:
+        # Text that follows the last child is stored in that child's tail
+        parent[-1].tail = (parent[-1].tail or '') + text
+    else:
+        parent.text = (parent.text or '') + text
+
+
+def merge(parent, span):
+    '''Move the text and children of `span` to the end of `parent`, then
+    remove `span` from its own parent.'''
+    if span.text:
+        append_text(parent, span.text)
+    # Iterate over a copy, since append() re-parents each child (lxml)
+    for child in list(span):
+        parent.append(child)
+    if span.tail:
+        append_text(parent, span.tail)
+    span.getparent().remove(span)
+
+
+def merge_run(run):
+    '''Merge every span in the list `run` into the first span of the run.'''
+    parent = run[0]
+    for span in run[1:]:
+        merge(parent, span)
+
+
+def cleanup_markup(root, styles):
+    '''Merge runs of mergeable <span> elements in the tree of `root`, in place.
+
+    `styles` is accepted for the caller's benefit but is not used here.
+    '''
+    # Merge consecutive spans that have the same styling
+    current_run = []
+    for span in root.xpath('//span'):
+        if not current_run:
+            current_run.append(span)
+        else:
+            last = current_run[-1]
+            if mergeable(last, span):
+                current_run.append(span)
+            else:
+                if len(current_run) > 1:
+                    merge_run(current_run)
+                current_run = [span]
+    # A run is only flushed when the next run starts, so the last run of the
+    # document has to be merged after the loop
+    if len(current_run) > 1:
+        merge_run(current_run)
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
index 7ee6e9e242..49d31abba5 100644
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -23,6 +23,7 @@ from calibre.ebooks.docx.fonts import Fonts
 from calibre.ebooks.docx.images import Images
 from calibre.ebooks.docx.tables import Tables
 from calibre.ebooks.docx.footnotes import Footnotes
+from calibre.ebooks.docx.cleanup import cleanup_markup
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.oeb.polish.toc import elem_to_toc_text
@@ -157,6 +158,8 @@ class Convert(object):
                     notes_header.set('class', '%s notes-header' % cls)
                 break
 
+        cleanup_markup(self.html, self.styles)
+
         return self.write()
 
     def read_page_properties(self, doc):