DOCX Input: Fix figures in newer Word documents being duplicated. Fixes #1789238 [textboxes duplicated after import from docx](https://bugs.launchpad.net/calibre/+bug/1789238)

Newer Word versions use a proprietary image markup with a fallback to
the standard form. Ignore the proprietary form and use only the
fallback, standard version when converting.
This commit is contained in:
Kovid Goyal 2018-08-30 08:53:27 +05:30
parent 03126d37f8
commit 30b1f442a6
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 13 additions and 0 deletions

View File

@ -39,6 +39,7 @@ TRANSITIONAL_NAMESPACES = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
'o': 'urn:schemas-microsoft-com:office:office',
've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
# Text Content
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
'w10': 'urn:schemas-microsoft-com:office:word',

View File

@ -120,6 +120,7 @@ class Convert(object):
self.log.debug('Converting Word markup to HTML')
self.read_page_properties(doc)
self.resolve_alternate_content(doc)
self.current_rels = relationships_by_id
for wp, page_properties in self.page_map.iteritems():
self.current_page = page_properties
@ -267,6 +268,17 @@ class Convert(object):
for x in current:
self.page_map[x] = pr
def resolve_alternate_content(self, doc):
    '''
    Drop the proprietary variants from every mc:AlternateContent element
    found under doc, so that only the spec compliant fallback is left.

    For each mc:AlternateContent element that has at least one direct
    mc:Fallback child, all of its direct mc:Choice children are removed.
    Elements that have no mc:Fallback child are left untouched, since their
    choices are then the only content available.

    :param doc: Root of the parsed document tree; it is modified in place.
    :return: None
    '''
    # See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility
    # The two queries do not depend on the element being examined, so look
    # them up once instead of once per mc:AlternateContent element.
    find_choices = self.namespace.XPath('./mc:Choice')
    find_fallbacks = self.namespace.XPath('./mc:Fallback')
    for ac in self.namespace.descendants(doc, 'mc:AlternateContent'):
        if not find_fallbacks(ac):
            # Nothing to fall back to, keep the choices
            continue
        for choice in find_choices(ac):
            ac.remove(choice)
def read_styles(self, relationships_by_type):
def get_name(rtype, defname):