From 6d58813c654646d709fde0f467df07161b7c82c8 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 10 Apr 2015 16:14:09 +0530
Subject: [PATCH] DOCX Input: Add support for DOCX files created by Word 2013 in "Strict" mode

---
Review notes (placed after the "---" marker, so not part of the commit message):
- This patch had lost its line breaks; they were restored from the hunk line
  counts and Python syntax. Leading indentation is reconstructed and runs of
  spaces inside a line are kept as found, so context may not match the target
  tree byte-for-byte; `git apply --ignore-whitespace` may be needed.
- The From: header carries no e-mail address as found.
- twips() falls through to an implicit None when val is neither a number nor a
  number with a "pt" suffix; setval() in PageProperties relies on that.

 src/calibre/ebooks/docx/block_styles.py | 13 ++++++++++--
 src/calibre/ebooks/docx/names.py        | 16 ++++++++++++++
 src/calibre/ebooks/docx/styles.py       | 28 +++++++++----------------
 3 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/src/calibre/ebooks/docx/block_styles.py b/src/calibre/ebooks/docx/block_styles.py
index 103fb6c30f..c7b99336e7 100644
--- a/src/calibre/ebooks/docx/block_styles.py
+++ b/src/calibre/ebooks/docx/block_styles.py
@@ -28,7 +28,15 @@ def simple_float(val, mult=1.0):
     try:
         return float(val) * mult
     except (ValueError, TypeError, AttributeError, KeyError):
-        return None
+        pass
+
+def twips(val, mult=0.05):
+    ''' Parse val as either a pure number representing twentieths of a point or a number followed by the suffix pt, representing pts.'''
+    try:
+        return float(val) * mult
+    except (ValueError, TypeError, AttributeError, KeyError):
+        if val and val.endswith('pt') and mult == 0.05:
+            return twips(val[:-2], mult=1.0)
 
 
 LINE_STYLES = { # {{{
@@ -160,7 +168,8 @@ def read_spacing(parent, dest, XPath, get):
         l, lr = get(s, 'w:line'), get(s, 'w:lineRule', 'auto')
         if l is not None:
             lh = simple_float(l, 0.05) if lr in {'exact', 'atLeast'} else simple_float(l, 1/240.0)
-            line_height = '%.3g%s' % (lh, 'pt' if lr in {'exact', 'atLeast'} else '')
+            if lh is not None:
+                line_height = '%.3g%s' % (lh, 'pt' if lr in {'exact', 'atLeast'} else '')
 
     setattr(dest, 'margin_top', padding_top)
     setattr(dest, 'margin_bottom', padding_bottom)
diff --git a/src/calibre/ebooks/docx/names.py b/src/calibre/ebooks/docx/names.py
index afa74f1f55..6f9c4247c0 100644
--- a/src/calibre/ebooks/docx/names.py
+++ b/src/calibre/ebooks/docx/names.py
@@ -30,6 +30,11 @@ TRANSITIONAL_NAMES = {
     'WEB_SETTINGS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings',
 }
 
+STRICT_NAMES = {
+    k:v.replace('http://schemas.openxmlformats.org/officeDocument/2006', 'http://purl.oclc.org/ooxml/officeDocument')
+    for k, v in TRANSITIONAL_NAMES.iteritems()
+}
+
 TRANSITIONAL_NAMESPACES = {
     'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
     'o': 'urn:schemas-microsoft-com:office:office',
@@ -60,6 +65,14 @@ TRANSITIONAL_NAMESPACES = {
     'dcmitype': 'http://purl.org/dc/dcmitype/',
     'dcterms': 'http://purl.org/dc/terms/'
 }
+
+STRICT_NAMESPACES = {
+    k:v.replace(
+        'http://schemas.openxmlformats.org/officeDocument/2006', 'http://purl.oclc.org/ooxml/officeDocument').replace(
+        'http://schemas.openxmlformats.org/wordprocessingml/2006', 'http://purl.oclc.org/ooxml/wordprocessingml').replace(
+        'http://schemas.openxmlformats.org/drawingml/2006', 'http://purl.oclc.org/ooxml/drawingml')
+    for k, v in TRANSITIONAL_NAMESPACES.iteritems()
+}
 # }}}
 
 def barename(x):
@@ -83,6 +96,9 @@ class DOCXNamespace(object):
         if transitional:
             self.namespaces = TRANSITIONAL_NAMESPACES.copy()
             self.names = TRANSITIONAL_NAMES.copy()
+        else:
+            self.namespaces = STRICT_NAMESPACES.copy()
+            self.names = STRICT_NAMES.copy()
 
     def XPath(self, expr):
         ans = self.xpath_cache.get(expr, None)
diff --git a/src/calibre/ebooks/docx/styles.py b/src/calibre/ebooks/docx/styles.py
index ba1900aafd..7b36a29325 100644
--- a/src/calibre/ebooks/docx/styles.py
+++ b/src/calibre/ebooks/docx/styles.py
@@ -9,7 +9,7 @@ __copyright__ = '2013, Kovid Goyal '
 import textwrap
 from collections import OrderedDict, Counter
 
-from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit
+from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit, twips
 from calibre.ebooks.docx.char_styles import RunStyle
 from calibre.ebooks.docx.tables import TableStyle
 
@@ -21,29 +21,21 @@ class PageProperties(object):
     '''
 
     def __init__(self, namespace, elems=()):
-        self.width = self.height = 595.28, 841.89 # pts, A4
+        self.width, self.height = 595.28, 841.89 # pts, A4
         self.margin_left = self.margin_right = 72 # pts
+
+        def setval(attr, val):
+            val = twips(val)
+            if val is not None:
+                setattr(self, attr, val)
+
         for sectPr in elems:
             for pgSz in namespace.XPath('./w:pgSz')(sectPr):
                 w, h = namespace.get(pgSz, 'w:w'), namespace.get(pgSz, 'w:h')
-                try:
-                    self.width = int(w)/20
-                except (ValueError, TypeError):
-                    pass
-                try:
-                    self.height = int(h)/20
-                except (ValueError, TypeError):
-                    pass
+                setval('width', w), setval('height', h)
             for pgMar in namespace.XPath('./w:pgMar')(sectPr):
                 l, r = namespace.get(pgMar, 'w:left'), namespace.get(pgMar, 'w:right')
-                try:
-                    self.margin_left = int(l)/20
-                except (ValueError, TypeError):
-                    pass
-                try:
-                    self.margin_right = int(r)/20
-                except (ValueError, TypeError):
-                    pass
+                setval('margin_left', l), setval('margin_right', r)
 
 
 class Style(object):