DOCX Input: Add support for DOCX files created by Word 2013 in "Strict" mode

This commit is contained in:
Kovid Goyal 2015-04-10 16:14:09 +05:30
parent cf2aa25944
commit 6d58813c65
3 changed files with 37 additions and 20 deletions

View File

@ -28,7 +28,15 @@ def simple_float(val, mult=1.0):
try:
return float(val) * mult
except (ValueError, TypeError, AttributeError, KeyError):
return None
pass
def twips(val, mult=0.05):
    ''' Parse val as either a pure number representing twentieths of a point
    or a number followed by the suffix pt, representing pts.

    :param val: The raw value to parse, normally a string. None, empty and
                non-string values are treated as unparseable.
    :param mult: Factor applied to a pure number. The default, 0.05, converts
                 twentieths of a point into pts.
    :return: The parsed length as a float, or None if val cannot be parsed.
    '''
    try:
        return float(val) * mult
    except (ValueError, TypeError, AttributeError, KeyError):
        pass
    # The pt suffix is only honoured for the default twips conversion; the
    # recursive call below passes mult=1.0, so it can never recurse again.
    if not val or mult != 0.05:
        return None
    try:
        has_pt_suffix = val.endswith('pt')
    except (AttributeError, TypeError):
        # Not a text value (for example a list or dict): report it as
        # unparseable instead of letting the exception escape.
        return None
    if not has_pt_suffix:
        return None
    return twips(val[:-2], mult=1.0)
LINE_STYLES = { # {{{
@ -160,7 +168,8 @@ def read_spacing(parent, dest, XPath, get):
l, lr = get(s, 'w:line'), get(s, 'w:lineRule', 'auto')
if l is not None:
lh = simple_float(l, 0.05) if lr in {'exact', 'atLeast'} else simple_float(l, 1/240.0)
line_height = '%.3g%s' % (lh, 'pt' if lr in {'exact', 'atLeast'} else '')
if lh is not None:
line_height = '%.3g%s' % (lh, 'pt' if lr in {'exact', 'atLeast'} else '')
setattr(dest, 'margin_top', padding_top)
setattr(dest, 'margin_bottom', padding_bottom)

View File

@ -30,6 +30,11 @@ TRANSITIONAL_NAMES = {
'WEB_SETTINGS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings',
}
# Strict-mode counterpart of TRANSITIONAL_NAMES: same keys, with the
# schemas.openxmlformats.org officeDocument/2006 prefix of every URI replaced
# by the purl.oclc.org/ooxml one.
STRICT_NAMES = dict(
    (name, uri.replace(
        'http://schemas.openxmlformats.org/officeDocument/2006',
        'http://purl.oclc.org/ooxml/officeDocument'))
    for name, uri in TRANSITIONAL_NAMES.iteritems()
)
TRANSITIONAL_NAMESPACES = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
'o': 'urn:schemas-microsoft-com:office:office',
@ -60,6 +65,14 @@ TRANSITIONAL_NAMESPACES = {
'dcmitype': 'http://purl.org/dc/dcmitype/',
'dcterms': 'http://purl.org/dc/terms/'
}
# Strict-mode counterpart of TRANSITIONAL_NAMESPACES: same prefixes, with the
# officeDocument, wordprocessingml and drawingml URI prefixes swapped for
# their purl.oclc.org/ooxml equivalents. URIs that match none of the three
# prefixes are carried over unchanged.
STRICT_NAMESPACES = dict(
    (prefix, uri.replace(
        'http://schemas.openxmlformats.org/officeDocument/2006',
        'http://purl.oclc.org/ooxml/officeDocument'
    ).replace(
        'http://schemas.openxmlformats.org/wordprocessingml/2006',
        'http://purl.oclc.org/ooxml/wordprocessingml'
    ).replace(
        'http://schemas.openxmlformats.org/drawingml/2006',
        'http://purl.oclc.org/ooxml/drawingml'
    ))
    for prefix, uri in TRANSITIONAL_NAMESPACES.iteritems()
)
# }}}
def barename(x):
@ -83,6 +96,9 @@ class DOCXNamespace(object):
if transitional:
self.namespaces = TRANSITIONAL_NAMESPACES.copy()
self.names = TRANSITIONAL_NAMES.copy()
else:
self.namespaces = STRICT_NAMESPACES.copy()
self.names = STRICT_NAMES.copy()
def XPath(self, expr):
ans = self.xpath_cache.get(expr, None)

View File

@ -9,7 +9,7 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import textwrap
from collections import OrderedDict, Counter
from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit
from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit, twips
from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.tables import TableStyle
@ -21,29 +21,21 @@ class PageProperties(object):
'''
def __init__(self, namespace, elems=()):
    '''
    Initialise the page width, height and left/right margins (all in pts),
    starting from defaults and then applying the values found in elems.

    :param namespace: Object providing ``XPath(expr)``, which returns a
                      callable applied to an element, and ``get(elem, attr)``
                      for reading an attribute value.
    :param elems: Iterable of section property elements. They are processed
                  in order, so a later element overrides an earlier one.
    '''
    # Defaults, kept whenever no element supplies a parseable value
    self.width, self.height = 595.28, 841.89  # pts, A4
    self.margin_left = self.margin_right = 72  # pts

    def setval(attr, val):
        # twips() yields None for values it cannot parse; in that case the
        # attribute keeps whatever value it already has.
        val = twips(val)
        if val is not None:
            setattr(self, attr, val)

    for sectPr in elems:
        for pgSz in namespace.XPath('./w:pgSz')(sectPr):
            w, h = namespace.get(pgSz, 'w:w'), namespace.get(pgSz, 'w:h')
            setval('width', w)
            setval('height', h)
        for pgMar in namespace.XPath('./w:pgMar')(sectPr):
            l, r = namespace.get(pgMar, 'w:left'), namespace.get(pgMar, 'w:right')
            setval('margin_left', l)
            setval('margin_right', r)
class Style(object):