DOCX: Merge runs with the same styling to reduce the amount of generated markup
This commit is contained in:
Kovid Goyal 2013-06-12 12:43:22 +05:30
parent 0cc7a43ded
commit 83b0ef58fe
2 changed files with 62 additions and 0 deletions

View File

@ -0,0 +1,59 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
def mergeable(previous, current):
    """Return True if ``current`` may be folded into ``previous``.

    Both arguments are element objects exposing ``tail``, ``get()`` and
    ``itersiblings()`` (presumably lxml elements -- confirm with caller).

    Merging is refused when:
      * either element has tail text,
      * the two ``class`` attribute values differ,
      * ``current`` has a truthy ``id`` attribute,
      * ``current`` is not the first item yielded by
        ``previous.itersiblings()``.
    """
    if previous.tail or current.tail:
        return False
    same_class = previous.get('class', None) == current.get('class', None)
    if not same_class or current.get('id', False):
        return False
    # A default of None stands in for "previous has no following sibling".
    following = next(previous.itersiblings(), None)
    return following is current
def append_text(parent, text):
    """Append ``text`` at the very end of ``parent``'s content.

    If ``parent`` has children, the text is added to the tail of the last
    child; otherwise it is added to ``parent.text``. A missing (falsy)
    existing value is treated as the empty string.
    """
    if len(parent) == 0:
        parent.text = (parent.text or '') + text
        return
    last_child = parent[-1]
    last_child.tail = (last_child.tail or '') + text
def merge(parent, span):
    """Fold the contents of ``span`` into ``parent`` and drop ``span``.

    In order: ``span.text`` is appended to the end of ``parent``, each child
    of ``span`` is appended to ``parent``, ``span.tail`` is appended to the
    end of ``parent``, and finally ``span`` is removed from its own parent
    element.
    """
    leading = span.text
    if leading:
        append_text(parent, leading)
    # NOTE(review): children are re-parented while ``span`` is being
    # iterated; this presumably relies on the element library tolerating
    # that -- confirm.
    for node in span:
        parent.append(node)
    trailing = span.tail
    if trailing:
        append_text(parent, trailing)
    container = span.getparent()
    container.remove(span)
def merge_run(run):
    """Collapse a run of elements into its first element.

    Every element after the first is merged, in order, into ``run[0]``
    via ``merge()``.
    """
    target, followers = run[0], run[1:]
    for follower in followers:
        merge(target, follower)
def cleanup_markup(root, styles):
    """Merge consecutive spans that have the same styling.

    Every element matched by ``root.xpath('//span')`` is visited in the
    order returned. Spans are collected into a run for as long as each one
    is ``mergeable()`` with the one before it; a run of two or more spans
    is collapsed into its first span via ``merge_run()``.

    :param root: Tree root exposing ``xpath()``.
    :param styles: Currently unused by this function; kept for callers.
    """
    current_run = []
    for span in root.xpath('//span'):
        if current_run and mergeable(current_run[-1], span):
            current_run.append(span)
            continue
        # ``span`` starts a new run: flush the one collected so far.
        if len(current_run) > 1:
            merge_run(current_run)
        current_run = [span]
    # The loop only flushes a run when a later span breaks it, so the final
    # run must be flushed here or trailing mergeable spans stay unmerged.
    if len(current_run) > 1:
        merge_run(current_run)

View File

@ -23,6 +23,7 @@ from calibre.ebooks.docx.fonts import Fonts
from calibre.ebooks.docx.images import Images from calibre.ebooks.docx.images import Images
from calibre.ebooks.docx.tables import Tables from calibre.ebooks.docx.tables import Tables
from calibre.ebooks.docx.footnotes import Footnotes from calibre.ebooks.docx.footnotes import Footnotes
from calibre.ebooks.docx.cleanup import cleanup_markup
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.oeb.polish.toc import elem_to_toc_text from calibre.ebooks.oeb.polish.toc import elem_to_toc_text
@ -157,6 +158,8 @@ class Convert(object):
notes_header.set('class', '%s notes-header' % cls) notes_header.set('class', '%s notes-header' % cls)
break break
cleanup_markup(self.html, self.styles)
return self.write() return self.write()
def read_page_properties(self, doc): def read_page_properties(self, doc):