DOCX: Refactor to support different namespace profiles

This refactoring is needed to support the "Strict" mode DOCX files that
Word 2013 can optionally generate.
This commit is contained in:
Kovid Goyal 2015-04-10 13:39:24 +05:30
parent 341f011372
commit b9f86450a0
24 changed files with 441 additions and 413 deletions

View File

@ -7,13 +7,12 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict from collections import OrderedDict
from calibre.ebooks.docx.names import XPath, get
class Inherit: class Inherit:
pass pass
inherit = Inherit() inherit = Inherit()
def binary_property(parent, name): def binary_property(parent, name, XPath, get):
vals = XPath('./w:%s' % name)(parent) vals = XPath('./w:%s' % name)(parent)
if not vals: if not vals:
return inherit return inherit
@ -68,7 +67,7 @@ LINE_STYLES = { # {{{
border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color') border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color')
def read_single_border(parent, edge): def read_single_border(parent, edge, XPath, get):
color = style = width = padding = None color = style = width = padding = None
for elem in XPath('./w:%s' % edge)(parent): for elem in XPath('./w:%s' % edge)(parent):
c = get(elem, 'w:color') c = get(elem, 'w:color')
@ -95,19 +94,19 @@ def read_single_border(parent, edge):
width = 3 # WebKit needs 3pts to render double borders width = 3 # WebKit needs 3pts to render double borders
return {p:v for p, v in zip(border_props, (padding, width, style, color))} return {p:v for p, v in zip(border_props, (padding, width, style, color))}
def read_border(parent, dest, border_edges=('left', 'top', 'right', 'bottom'), name='pBdr'): def read_border(parent, dest, XPath, get, border_edges=('left', 'top', 'right', 'bottom'), name='pBdr'):
vals = {k % edge:inherit for edge in border_edges for k in border_props} vals = {k % edge:inherit for edge in border_edges for k in border_props}
for border in XPath('./w:' + name)(parent): for border in XPath('./w:' + name)(parent):
for edge in border_edges: for edge in border_edges:
for prop, val in read_single_border(border, edge).iteritems(): for prop, val in read_single_border(border, edge, XPath, get).iteritems():
if val is not None: if val is not None:
vals[prop % edge] = val vals[prop % edge] = val
for key, val in vals.iteritems(): for key, val in vals.iteritems():
setattr(dest, key, val) setattr(dest, key, val)
def read_indent(parent, dest): def read_indent(parent, dest, XPath, get):
padding_left = padding_right = text_indent = inherit padding_left = padding_right = text_indent = inherit
for indent in XPath('./w:ind')(parent): for indent in XPath('./w:ind')(parent):
l, lc = get(indent, 'w:left'), get(indent, 'w:leftChars') l, lc = get(indent, 'w:left'), get(indent, 'w:leftChars')
@ -133,7 +132,7 @@ def read_indent(parent, dest):
setattr(dest, 'margin_right', padding_right) setattr(dest, 'margin_right', padding_right)
setattr(dest, 'text_indent', text_indent) setattr(dest, 'text_indent', text_indent)
def read_justification(parent, dest): def read_justification(parent, dest, XPath, get):
ans = inherit ans = inherit
for jc in XPath('./w:jc[@w:val]')(parent): for jc in XPath('./w:jc[@w:val]')(parent):
val = get(jc, 'w:val') val = get(jc, 'w:val')
@ -145,7 +144,7 @@ def read_justification(parent, dest):
ans = val ans = val
setattr(dest, 'text_align', ans) setattr(dest, 'text_align', ans)
def read_spacing(parent, dest): def read_spacing(parent, dest, XPath, get):
padding_top = padding_bottom = line_height = inherit padding_top = padding_bottom = line_height = inherit
for s in XPath('./w:spacing')(parent): for s in XPath('./w:spacing')(parent):
a, al, aa = get(s, 'w:after'), get(s, 'w:afterLines'), get(s, 'w:afterAutospacing') a, al, aa = get(s, 'w:after'), get(s, 'w:afterLines'), get(s, 'w:afterAutospacing')
@ -167,7 +166,7 @@ def read_spacing(parent, dest):
setattr(dest, 'margin_bottom', padding_bottom) setattr(dest, 'margin_bottom', padding_bottom)
setattr(dest, 'line_height', line_height) setattr(dest, 'line_height', line_height)
def read_direction(parent, dest): def read_direction(parent, dest, XPath, get):
ans = inherit ans = inherit
for jc in XPath('./w:textFlow[@w:val]')(parent): for jc in XPath('./w:textFlow[@w:val]')(parent):
val = get(jc, 'w:val') val = get(jc, 'w:val')
@ -177,7 +176,7 @@ def read_direction(parent, dest):
ans = 'rtl' ans = 'rtl'
setattr(dest, 'direction', ans) setattr(dest, 'direction', ans)
def read_shd(parent, dest): def read_shd(parent, dest, XPath, get):
ans = inherit ans = inherit
for shd in XPath('./w:shd[@w:fill]')(parent): for shd in XPath('./w:shd[@w:fill]')(parent):
val = get(shd, 'w:fill') val = get(shd, 'w:fill')
@ -185,7 +184,7 @@ def read_shd(parent, dest):
ans = simple_color(val, auto='transparent') ans = simple_color(val, auto='transparent')
setattr(dest, 'background_color', ans) setattr(dest, 'background_color', ans)
def read_numbering(parent, dest): def read_numbering(parent, dest, XPath, get):
lvl = num_id = None lvl = num_id = None
for np in XPath('./w:numPr')(parent): for np in XPath('./w:numPr')(parent):
for ilvl in XPath('./w:ilvl[@w:val]')(np): for ilvl in XPath('./w:ilvl[@w:val]')(np):
@ -203,7 +202,7 @@ class Frame(object):
all_attributes = ('drop_cap', 'h', 'w', 'h_anchor', 'h_rule', 'v_anchor', 'wrap', all_attributes = ('drop_cap', 'h', 'w', 'h_anchor', 'h_rule', 'v_anchor', 'wrap',
'h_space', 'v_space', 'lines', 'x_align', 'y_align', 'x', 'y') 'h_space', 'v_space', 'lines', 'x_align', 'y_align', 'x', 'y')
def __init__(self, fp): def __init__(self, fp, XPath, get):
self.drop_cap = get(fp, 'w:dropCap', 'none') self.drop_cap = get(fp, 'w:dropCap', 'none')
try: try:
self.h = int(get(fp, 'w:h'))/20 self.h = int(get(fp, 'w:h'))/20
@ -275,10 +274,10 @@ class Frame(object):
def __ne__(self, other): def __ne__(self, other):
return not self.__eq__(other) return not self.__eq__(other)
def read_frame(parent, dest): def read_frame(parent, dest, XPath, get):
ans = inherit ans = inherit
for fp in XPath('./w:framePr')(parent): for fp in XPath('./w:framePr')(parent):
ans = Frame(fp) ans = Frame(fp, XPath, get)
setattr(dest, 'frame', ans) setattr(dest, 'frame', ans)
# }}} # }}}
@ -303,7 +302,8 @@ class ParagraphStyle(object):
'numbering', 'font_family', 'font_size', 'color', 'frame', 'numbering', 'font_family', 'font_size', 'color', 'frame',
) )
def __init__(self, pPr=None): def __init__(self, namespace, pPr=None):
self.namespace = namespace
self.linked_style = None self.linked_style = None
if pPr is None: if pPr is None:
for p in self.all_properties: for p in self.all_properties:
@ -315,14 +315,14 @@ class ParagraphStyle(object):
'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers', 'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap', 'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',
): ):
setattr(self, p, binary_property(pPr, p)) setattr(self, p, binary_property(pPr, p, namespace.XPath, namespace.get))
for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd', 'numbering', 'frame'): for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd', 'numbering', 'frame'):
f = globals()['read_%s' % x] f = globals()['read_%s' % x]
f(pPr, self) f(pPr, self, namespace.XPath, namespace.get)
for s in XPath('./w:pStyle[@w:val]')(pPr): for s in namespace.XPath('./w:pStyle[@w:val]')(pPr):
self.linked_style = get(s, 'w:val') self.linked_style = namespace.get(s, 'w:val')
self.font_family = self.font_size = self.color = inherit self.font_family = self.font_size = self.color = inherit

View File

@ -9,10 +9,9 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict from collections import OrderedDict
from calibre.ebooks.docx.block_styles import ( # noqa from calibre.ebooks.docx.block_styles import ( # noqa
inherit, simple_color, LINE_STYLES, simple_float, binary_property, read_shd) inherit, simple_color, LINE_STYLES, simple_float, binary_property, read_shd)
from calibre.ebooks.docx.names import XPath, get
# Read from XML {{{ # Read from XML {{{
def read_text_border(parent, dest): def read_text_border(parent, dest, XPath, get):
border_color = border_style = border_width = padding = inherit border_color = border_style = border_width = padding = inherit
elems = XPath('./w:bdr')(parent) elems = XPath('./w:bdr')(parent)
if elems and elems[0].attrib: if elems and elems[0].attrib:
@ -46,7 +45,7 @@ def read_text_border(parent, dest):
setattr(dest, 'border_width', border_width) setattr(dest, 'border_width', border_width)
setattr(dest, 'padding', padding) setattr(dest, 'padding', padding)
def read_color(parent, dest): def read_color(parent, dest, XPath, get):
ans = inherit ans = inherit
for col in XPath('./w:color[@w:val]')(parent): for col in XPath('./w:color[@w:val]')(parent):
val = get(col, 'w:val') val = get(col, 'w:val')
@ -61,7 +60,7 @@ def convert_highlight_color(val):
'darkGreen': '#008000', 'darkMagenta': '#800080', 'darkRed': '#800000', 'darkYellow': '#808000', 'darkGreen': '#008000', 'darkMagenta': '#800080', 'darkRed': '#800000', 'darkYellow': '#808000',
'lightGray': '#c0c0c0'}.get(val, val) 'lightGray': '#c0c0c0'}.get(val, val)
def read_highlight(parent, dest): def read_highlight(parent, dest, XPath, get):
ans = inherit ans = inherit
for col in XPath('./w:highlight[@w:val]')(parent): for col in XPath('./w:highlight[@w:val]')(parent):
val = get(col, 'w:val') val = get(col, 'w:val')
@ -74,7 +73,7 @@ def read_highlight(parent, dest):
ans = val ans = val
setattr(dest, 'highlight', ans) setattr(dest, 'highlight', ans)
def read_lang(parent, dest): def read_lang(parent, dest, XPath, get):
ans = inherit ans = inherit
for col in XPath('./w:lang[@w:val]')(parent): for col in XPath('./w:lang[@w:val]')(parent):
val = get(col, 'w:val') val = get(col, 'w:val')
@ -91,7 +90,7 @@ def read_lang(parent, dest):
ans = val ans = val
setattr(dest, 'lang', ans) setattr(dest, 'lang', ans)
def read_letter_spacing(parent, dest): def read_letter_spacing(parent, dest, XPath, get):
ans = inherit ans = inherit
for col in XPath('./w:spacing[@w:val]')(parent): for col in XPath('./w:spacing[@w:val]')(parent):
val = simple_float(get(col, 'w:val'), 0.05) val = simple_float(get(col, 'w:val'), 0.05)
@ -99,7 +98,7 @@ def read_letter_spacing(parent, dest):
ans = val ans = val
setattr(dest, 'letter_spacing', ans) setattr(dest, 'letter_spacing', ans)
def read_sz(parent, dest): def read_sz(parent, dest, XPath, get):
ans = inherit ans = inherit
for col in XPath('./w:sz[@w:val]')(parent): for col in XPath('./w:sz[@w:val]')(parent):
val = simple_float(get(col, 'w:val'), 0.5) val = simple_float(get(col, 'w:val'), 0.5)
@ -107,7 +106,7 @@ def read_sz(parent, dest):
ans = val ans = val
setattr(dest, 'font_size', ans) setattr(dest, 'font_size', ans)
def read_underline(parent, dest): def read_underline(parent, dest, XPath, get):
ans = inherit ans = inherit
for col in XPath('./w:u[@w:val]')(parent): for col in XPath('./w:u[@w:val]')(parent):
val = get(col, 'w:val') val = get(col, 'w:val')
@ -115,7 +114,7 @@ def read_underline(parent, dest):
ans = val if val == 'none' else 'underline' ans = val if val == 'none' else 'underline'
setattr(dest, 'text_decoration', ans) setattr(dest, 'text_decoration', ans)
def read_vert_align(parent, dest): def read_vert_align(parent, dest, XPath, get):
ans = inherit ans = inherit
for col in XPath('./w:vertAlign[@w:val]')(parent): for col in XPath('./w:vertAlign[@w:val]')(parent):
val = get(col, 'w:val') val = get(col, 'w:val')
@ -123,7 +122,7 @@ def read_vert_align(parent, dest):
ans = val ans = val
setattr(dest, 'vert_align', ans) setattr(dest, 'vert_align', ans)
def read_font_family(parent, dest): def read_font_family(parent, dest, XPath, get):
ans = inherit ans = inherit
for col in XPath('./w:rFonts')(parent): for col in XPath('./w:rFonts')(parent):
val = get(col, 'w:asciiTheme') val = get(col, 'w:asciiTheme')
@ -150,7 +149,8 @@ class RunStyle(object):
'b', 'bCs', 'caps', 'emboss', 'i', 'iCs', 'imprint', 'shadow', 'smallCaps', 'strike', 'dstrike', 'vanish', 'b', 'bCs', 'caps', 'emboss', 'i', 'iCs', 'imprint', 'shadow', 'smallCaps', 'strike', 'dstrike', 'vanish',
} }
def __init__(self, rPr=None): def __init__(self, namespace, rPr=None):
self.namespace = namespace
self.linked_style = None self.linked_style = None
if rPr is None: if rPr is None:
for p in self.all_properties: for p in self.all_properties:
@ -160,14 +160,14 @@ class RunStyle(object):
'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow', 'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow',
'smallCaps', 'strike', 'vanish', 'webHidden', 'smallCaps', 'strike', 'vanish', 'webHidden',
): ):
setattr(self, p, binary_property(rPr, p)) setattr(self, p, binary_property(rPr, p, namespace.XPath, namespace.get))
for x in ('text_border', 'color', 'highlight', 'shd', 'letter_spacing', 'sz', 'underline', 'vert_align', 'lang', 'font_family'): for x in ('text_border', 'color', 'highlight', 'shd', 'letter_spacing', 'sz', 'underline', 'vert_align', 'lang', 'font_family'):
f = globals()['read_%s' % x] f = globals()['read_%s' % x]
f(rPr, self) f(rPr, self, namespace.XPath, namespace.get)
for s in XPath('./w:rStyle[@w:val]')(rPr): for s in namespace.XPath('./w:rStyle[@w:val]')(rPr):
self.linked_style = get(s, 'w:val') self.linked_style = namespace.get(s, 'w:val')
self._css = None self._css = None

View File

@ -8,7 +8,6 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os import os
from calibre.ebooks.docx.names import XPath
NBSP = '\xa0' NBSP = '\xa0'
def mergeable(previous, current): def mergeable(previous, current):
@ -99,7 +98,7 @@ def before_count(root, tag, limit=10):
if ans > limit: if ans > limit:
return limit return limit
def cleanup_markup(log, root, styles, dest_dir, detect_cover): def cleanup_markup(log, root, styles, dest_dir, detect_cover, XPath):
# Move <hr>s outside paragraphs, if possible. # Move <hr>s outside paragraphs, if possible.
pancestor = XPath('|'.join('ancestor::%s[1]' % x for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) pancestor = XPath('|'.join('ancestor::%s[1]' % x for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')))
for hr in root.xpath('//span/hr'): for hr in root.xpath('//span/hr'):

View File

@ -14,7 +14,7 @@ from calibre import walk, guess_type
from calibre.ebooks.metadata import string_to_authors, authors_to_sort_string from calibre.ebooks.metadata import string_to_authors, authors_to_sort_string
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.docx import InvalidDOCX from calibre.ebooks.docx import InvalidDOCX
from calibre.ebooks.docx.names import DOCUMENT, DOCPROPS, XPath, APPPROPS from calibre.ebooks.docx.names import DOCXNamespace
from calibre.ptempfile import PersistentTemporaryDirectory from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.localization import canonicalize_lang from calibre.utils.localization import canonicalize_lang
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
@ -25,7 +25,7 @@ def fromstring(raw, parser=RECOVER_PARSER):
return etree.fromstring(raw, parser=parser) return etree.fromstring(raw, parser=parser)
# Read metadata {{{ # Read metadata {{{
def read_doc_props(raw, mi): def read_doc_props(raw, mi, XPath):
root = fromstring(raw) root = fromstring(raw)
titles = XPath('//dc:title')(root) titles = XPath('//dc:title')(root)
if titles: if titles:
@ -72,7 +72,7 @@ def read_app_props(raw, mi):
if company and company[0].text and company[0].text.strip(): if company and company[0].text and company[0].text.strip():
mi.publisher = company[0].text.strip() mi.publisher = company[0].text.strip()
def read_default_style_language(raw, mi): def read_default_style_language(raw, mi, XPath):
root = fromstring(raw) root = fromstring(raw)
for lang in XPath('/w:styles/w:docDefaults/w:rPrDefault/w:rPr/w:lang/@w:val')(root): for lang in XPath('/w:styles/w:docDefaults/w:rPrDefault/w:rPr/w:lang/@w:val')(root):
lang = canonicalize_lang(lang) lang = canonicalize_lang(lang)
@ -84,6 +84,7 @@ def read_default_style_language(raw, mi):
class DOCX(object): class DOCX(object):
def __init__(self, path_or_stream, log=None, extract=True): def __init__(self, path_or_stream, log=None, extract=True):
self.docx_is_transitional = True
stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb') stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb')
self.name = getattr(stream, 'name', None) or '<stream>' self.name = getattr(stream, 'name', None) or '<stream>'
self.log = log or default_log self.log = log or default_log
@ -93,6 +94,7 @@ class DOCX(object):
self.init_zipfile(stream) self.init_zipfile(stream)
self.read_content_types() self.read_content_types()
self.read_package_relationships() self.read_package_relationships()
self.namespace = DOCXNamespace(self.docx_is_transitional)
def init_zipfile(self, stream): def init_zipfile(self, stream):
self.zipf = ZipFile(stream) self.zipf = ZipFile(stream)
@ -158,12 +160,14 @@ class DOCX(object):
for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'): for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
target = item.get('Target').lstrip('/') target = item.get('Target').lstrip('/')
typ = item.get('Type') typ = item.get('Type')
if target == 'word/document.xml':
self.docx_is_transitional = typ != 'http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument'
self.relationships[typ] = target self.relationships[typ] = target
self.relationships_rmap[target] = typ self.relationships_rmap[target] = typ
@property @property
def document_name(self): def document_name(self):
name = self.relationships.get(DOCUMENT, None) name = self.relationships.get(self.namespace.names['DOCUMENT'], None)
if name is None: if name is None:
names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml')) names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
if not names: if not names:
@ -201,13 +205,13 @@ class DOCX(object):
return by_id, by_type return by_id, by_type
def get_document_properties_names(self): def get_document_properties_names(self):
name = self.relationships.get(DOCPROPS, None) name = self.relationships.get(self.namespace.names['DOCPROPS'], None)
if name is None: if name is None:
names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml') names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml')
if names: if names:
name = names[0] name = names[0]
yield name yield name
name = self.relationships.get(APPPROPS, None) name = self.relationships.get(self.namespace.names['APPPROPS'], None)
if name is None: if name is None:
names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml') names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
if names: if names:
@ -224,16 +228,16 @@ class DOCX(object):
except KeyError: except KeyError:
pass pass
else: else:
read_doc_props(raw, mi) read_doc_props(raw, mi, self.namespace.XPath)
if mi.is_null('language'): if mi.is_null('language'):
try: try:
raw = self.read('word/styles.xml') raw = self.read('word/styles.xml')
except KeyError: except KeyError:
pass pass
else: else:
read_default_style_language(raw, mi) read_default_style_language(raw, mi, self.namespace.XPath)
ap_name = self.relationships.get(APPPROPS, None) ap_name = self.relationships.get(self.namespace.names['APPPROPS'], None)
if ap_name: if ap_name:
try: try:
raw = self.read(ap_name) raw = self.read(ap_name)

View File

@ -9,7 +9,6 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re import re
from calibre.ebooks.docx.index import process_index, polish_index_markup from calibre.ebooks.docx.index import process_index, polish_index_markup
from calibre.ebooks.docx.names import XPath, get, namespaces
class Field(object): class Field(object):
@ -48,9 +47,6 @@ scanner = re.Scanner([
null = object() null = object()
def WORD(x):
return '{%s}%s' % (namespaces['w'], x)
def parser(name, field_map, default_field_name=None): def parser(name, field_map, default_field_name=None):
field_map = dict((x.split(':') for x in field_map.split())) field_map = dict((x.split(':') for x in field_map.split()))
@ -98,22 +94,23 @@ parse_noteref = parser('noteref',
class Fields(object): class Fields(object):
def __init__(self): def __init__(self, namespace):
self.namespace = namespace
self.fields = [] self.fields = []
self.index_bookmark_counter = 0 self.index_bookmark_counter = 0
self.index_bookmark_prefix = 'index-' self.index_bookmark_prefix = 'index-'
def __call__(self, doc, log): def __call__(self, doc, log):
all_ids = frozenset(XPath('//*/@w:id')(doc)) all_ids = frozenset(self.namespace.XPath('//*/@w:id')(doc))
c = 0 c = 0
while self.index_bookmark_prefix in all_ids: while self.index_bookmark_prefix in all_ids:
c += 1 c += 1
self.index_bookmark_prefix = self.index_bookmark_prefix.replace('-', '%d-' % c) self.index_bookmark_prefix = self.index_bookmark_prefix.replace('-', '%d-' % c)
stack = [] stack = []
for elem in XPath( for elem in self.namespace.XPath(
'//*[name()="w:p" or name()="w:r" or name()="w:instrText" or (name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end"))]')(doc): '//*[name()="w:p" or name()="w:r" or name()="w:instrText" or (name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end"))]')(doc):
if elem.tag.endswith('}fldChar'): if elem.tag.endswith('}fldChar'):
typ = get(elem, 'w:fldCharType') typ = self.namespace.get(elem, 'w:fldCharType')
if typ == 'begin': if typ == 'begin':
stack.append(Field(elem)) stack.append(Field(elem))
self.fields.append(stack[-1]) self.fields.append(stack[-1])
@ -193,6 +190,8 @@ class Fields(object):
if xe: if xe:
# We insert a synthetic bookmark around this index item so that we # We insert a synthetic bookmark around this index item so that we
# can link to it later # can link to it later
def WORD(x):
return self.namespace.expand('w:' + x)
self.index_bookmark_counter += 1 self.index_bookmark_counter += 1
bmark = xe['anchor'] = '%s%d' % (self.index_bookmark_prefix, self.index_bookmark_counter) bmark = xe['anchor'] = '%s%d' % (self.index_bookmark_prefix, self.index_bookmark_counter)
p = field.start.getparent() p = field.start.getparent()
@ -210,7 +209,7 @@ class Fields(object):
if not field.contents: if not field.contents:
return return
idx = parse_func(field.instructions, log) idx = parse_func(field.instructions, log)
hyperlinks, blocks = process_index(field, idx, self.xe_fields, log) hyperlinks, blocks = process_index(field, idx, self.xe_fields, log, self.namespace.XPath, self.namespace.expand)
if not blocks: if not blocks:
return return
for anchor, run in hyperlinks: for anchor, run in hyperlinks:

View File

@ -10,7 +10,6 @@ import os, re
from collections import namedtuple from collections import namedtuple
from calibre.ebooks.docx.block_styles import binary_property, inherit from calibre.ebooks.docx.block_styles import binary_property, inherit
from calibre.ebooks.docx.names import XPath, get
from calibre.utils.filenames import ascii_filename from calibre.utils.filenames import ascii_filename
from calibre.utils.fonts.scanner import font_scanner, NoFonts from calibre.utils.fonts.scanner import font_scanner, NoFonts
from calibre.utils.fonts.utils import panose_to_css_generic_family, is_truetype_font from calibre.utils.fonts.utils import panose_to_css_generic_family, is_truetype_font
@ -29,7 +28,7 @@ def get_variant(bold=False, italic=False):
class Family(object): class Family(object):
def __init__(self, elem, embed_relationships): def __init__(self, elem, embed_relationships, XPath, get):
self.name = self.family_name = get(elem, 'w:name') self.name = self.family_name = get(elem, 'w:name')
self.alt_names = tuple(get(x, 'w:val') for x in XPath('./w:altName')(elem)) self.alt_names = tuple(get(x, 'w:val') for x in XPath('./w:altName')(elem))
if self.alt_names and not has_system_fonts(self.name): if self.alt_names and not has_system_fonts(self.name):
@ -51,7 +50,7 @@ class Family(object):
for x in XPath('./w:family[@w:val]')(elem): for x in XPath('./w:family[@w:val]')(elem):
self.generic_family = get(x, 'w:val', 'auto') self.generic_family = get(x, 'w:val', 'auto')
ntt = binary_property(elem, 'notTrueType') ntt = binary_property(elem, 'notTrueType', XPath, get)
self.is_ttf = ntt is inherit or not ntt self.is_ttf = ntt is inherit or not ntt
self.panose1 = None self.panose1 = None
@ -73,13 +72,14 @@ class Family(object):
class Fonts(object): class Fonts(object):
def __init__(self): def __init__(self, namespace):
self.namespace = namespace
self.fonts = {} self.fonts = {}
self.used = set() self.used = set()
def __call__(self, root, embed_relationships, docx, dest_dir): def __call__(self, root, embed_relationships, docx, dest_dir):
for elem in XPath('//w:font[@w:name]')(root): for elem in self.namespace.XPath('//w:font[@w:name]')(root):
self.fonts[get(elem, 'w:name')] = Family(elem, embed_relationships) self.fonts[self.namespace.get(elem, 'w:name')] = Family(elem, embed_relationships, self.namespace.XPath, self.namespace.get)
def family_for(self, name, bold=False, italic=False): def family_for(self, name, bold=False, italic=False):
f = self.fonts.get(name, None) f = self.fonts.get(name, None)

View File

@ -8,42 +8,43 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict from collections import OrderedDict
from calibre.ebooks.docx.names import get, XPath, descendants
class Note(object): class Note(object):
def __init__(self, parent, rels): def __init__(self, namespace, parent, rels):
self.type = get(parent, 'w:type', 'normal') self.type = namespace.get(parent, 'w:type', 'normal')
self.parent = parent self.parent = parent
self.rels = rels self.rels = rels
self.namespace = namespace
def __iter__(self): def __iter__(self):
for p in descendants(self.parent, 'w:p', 'w:tbl'): for p in self.namespace.descendants(self.parent, 'w:p', 'w:tbl'):
yield p yield p
class Footnotes(object): class Footnotes(object):
def __init__(self): def __init__(self, namespace):
self.namespace = namespace
self.footnotes = {} self.footnotes = {}
self.endnotes = {} self.endnotes = {}
self.counter = 0 self.counter = 0
self.notes = OrderedDict() self.notes = OrderedDict()
def __call__(self, footnotes, footnotes_rels, endnotes, endnotes_rels): def __call__(self, footnotes, footnotes_rels, endnotes, endnotes_rels):
XPath, get = self.namespace.XPath, self.namespace.get
if footnotes is not None: if footnotes is not None:
for footnote in XPath('./w:footnote[@w:id]')(footnotes): for footnote in XPath('./w:footnote[@w:id]')(footnotes):
fid = get(footnote, 'w:id') fid = get(footnote, 'w:id')
if fid: if fid:
self.footnotes[fid] = Note(footnote, footnotes_rels) self.footnotes[fid] = Note(self.namespace, footnote, footnotes_rels)
if endnotes is not None: if endnotes is not None:
for endnote in XPath('./w:endnote[@w:id]')(endnotes): for endnote in XPath('./w:endnote[@w:id]')(endnotes):
fid = get(endnote, 'w:id') fid = get(endnote, 'w:id')
if fid: if fid:
self.endnotes[fid] = Note(endnote, endnotes_rels) self.endnotes[fid] = Note(self.namespace, endnote, endnotes_rels)
def get_ref(self, ref): def get_ref(self, ref):
fid = get(ref, 'w:id') fid = self.namespace.get(ref, 'w:id')
notes = self.footnotes if ref.tag.endswith('}footnoteReference') else self.endnotes notes = self.footnotes if ref.tag.endswith('}footnoteReference') else self.endnotes
note = notes.get(fid, None) note = notes.get(fid, None)
if note is not None and note.type == 'normal': if note is not None and note.type == 'normal':

View File

@ -11,7 +11,7 @@ import os
from lxml.html.builder import IMG, HR from lxml.html.builder import IMG, HR
from calibre.constants import iswindows from calibre.constants import iswindows
from calibre.ebooks.docx.names import XPath, get, barename from calibre.ebooks.docx.names import barename
from calibre.utils.filenames import ascii_filename from calibre.utils.filenames import ascii_filename
from calibre.utils.imghdr import what from calibre.utils.imghdr import what
@ -27,7 +27,7 @@ def emu_to_pt(x):
def pt_to_emu(x): def pt_to_emu(x):
return int(x * 12700) return int(x * 12700)
def get_image_properties(parent): def get_image_properties(parent, XPath, get):
width = height = None width = height = None
for extent in XPath('./wp:extent')(parent): for extent in XPath('./wp:extent')(parent):
try: try:
@ -67,7 +67,7 @@ def get_image_margins(elem):
ans['padding-%s' % css] = '%.3gpt' % val ans['padding-%s' % css] = '%.3gpt' % val
return ans return ans
def get_hpos(anchor, page_width): def get_hpos(anchor, page_width, XPath, get):
for ph in XPath('./wp:positionH')(anchor): for ph in XPath('./wp:positionH')(anchor):
rp = ph.get('relativeFrom', None) rp = ph.get('relativeFrom', None)
if rp == 'leftMargin': if rp == 'leftMargin':
@ -101,7 +101,8 @@ def get_hpos(anchor, page_width):
class Images(object): class Images(object):
def __init__(self, log): def __init__(self, namespace, log):
self.namespace = namespace
self.rid_map = {} self.rid_map = {}
self.used = {} self.used = {}
self.names = set() self.names = set()
@ -158,6 +159,7 @@ class Images(object):
return name return name
def pic_to_img(self, pic, alt, parent): def pic_to_img(self, pic, alt, parent):
XPath, get = self.namespace.XPath, self.namespace.get
name = None name = None
link = None link = None
for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent): for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
@ -191,9 +193,10 @@ class Images(object):
return img return img
def drawing_to_html(self, drawing, page): def drawing_to_html(self, drawing, page):
XPath, get = self.namespace.XPath, self.namespace.get
# First process the inline pictures # First process the inline pictures
for inline in XPath('./wp:inline')(drawing): for inline in XPath('./wp:inline')(drawing):
style, alt = get_image_properties(inline) style, alt = get_image_properties(inline, XPath, get)
for pic in XPath('descendant::pic:pic')(inline): for pic in XPath('descendant::pic:pic')(inline):
ans = self.pic_to_img(pic, alt, inline) ans = self.pic_to_img(pic, alt, inline)
if ans is not None: if ans is not None:
@ -203,7 +206,7 @@ class Images(object):
# Now process the floats # Now process the floats
for anchor in XPath('./wp:anchor')(drawing): for anchor in XPath('./wp:anchor')(drawing):
style, alt = get_image_properties(anchor) style, alt = get_image_properties(anchor, XPath, get)
self.get_float_properties(anchor, style, page) self.get_float_properties(anchor, style, page)
for pic in XPath('descendant::pic:pic')(anchor): for pic in XPath('descendant::pic:pic')(anchor):
ans = self.pic_to_img(pic, alt, anchor) ans = self.pic_to_img(pic, alt, anchor)
@ -213,6 +216,7 @@ class Images(object):
yield ans yield ans
def pict_to_html(self, pict, page): def pict_to_html(self, pict, page):
XPath, get = self.namespace.XPath, self.namespace.get
# First see if we have an <hr> # First see if we have an <hr>
is_hr = len(pict) == 1 and get(pict[0], 'o:hr') in {'t', 'true'} is_hr = len(pict) == 1 and get(pict[0], 'o:hr') in {'t', 'true'}
if is_hr: if is_hr:
@ -247,6 +251,7 @@ class Images(object):
yield img yield img
def get_float_properties(self, anchor, style, page): def get_float_properties(self, anchor, style, page):
XPath, get = self.namespace.XPath, self.namespace.get
if 'display' not in style: if 'display' not in style:
style['display'] = 'block' style['display'] = 'block'
padding = get_image_margins(anchor) padding = get_image_margins(anchor)
@ -257,7 +262,7 @@ class Images(object):
# Ignore margins # Ignore margins
page_width = page.width page_width = page.width
hpos = get_hpos(anchor, page_width) + width/(2*page_width) hpos = get_hpos(anchor, page_width, XPath, get) + width/(2*page_width)
wrap_elem = None wrap_elem = None
dofloat = False dofloat = False

View File

@ -10,10 +10,9 @@ from operator import itemgetter
from lxml import etree from lxml import etree
from calibre.ebooks.docx.names import XPath, expand
from calibre.utils.icu import partition_by_first_letter, sort_key from calibre.utils.icu import partition_by_first_letter, sort_key
def get_applicable_xe_fields(index, xe_fields): def get_applicable_xe_fields(index, xe_fields, XPath, expand):
iet = index.get('entry-type', None) iet = index.get('entry-type', None)
xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet] xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet]
@ -40,7 +39,7 @@ def get_applicable_xe_fields(index, xe_fields):
return [xe for xe in xe_fields if contained(xe)] return [xe for xe in xe_fields if contained(xe)]
def make_block(style, parent, pos): def make_block(expand, style, parent, pos):
p = parent.makeelement(expand('w:p')) p = parent.makeelement(expand('w:p'))
parent.insert(pos, p) parent.insert(pos, p)
if style is not None: if style is not None:
@ -56,7 +55,7 @@ def make_block(style, parent, pos):
r.append(t) r.append(t)
return p, t return p, t
def add_xe(xe, t): def add_xe(xe, t, expand):
text = xe.get('text', '') text = xe.get('text', '')
pt = xe.get('page-number-text', None) pt = xe.get('page-number-text', None)
t.text = text or ' ' t.text = text or ' '
@ -70,7 +69,7 @@ def add_xe(xe, t):
r.append(t2) r.append(t2)
return xe['anchor'], t.getparent() return xe['anchor'], t.getparent()
def process_index(field, index, xe_fields, log): def process_index(field, index, xe_fields, log, XPath, expand):
''' '''
We remove all the word generated index markup and replace it with our own We remove all the word generated index markup and replace it with our own
that is more suitable for an ebook. that is more suitable for an ebook.
@ -89,7 +88,7 @@ def process_index(field, index, xe_fields, log):
start_pos = (p, p.index(elem)) start_pos = (p, p.index(elem))
p.remove(elem) p.remove(elem)
xe_fields = get_applicable_xe_fields(index, xe_fields) xe_fields = get_applicable_xe_fields(index, xe_fields, XPath, expand)
if not xe_fields: if not xe_fields:
return [], [] return [], []
if heading_text is not None: if heading_text is not None:
@ -107,14 +106,14 @@ def process_index(field, index, xe_fields, log):
for item in reversed(items): for item in reversed(items):
is_heading = not isinstance(item, dict) is_heading = not isinstance(item, dict)
style = heading_style if is_heading else None style = heading_style if is_heading else None
p, t = make_block(style, *start_pos) p, t = make_block(expand, style, *start_pos)
if is_heading: if is_heading:
text = heading_text text = heading_text
if text.lower().startswith('a'): if text.lower().startswith('a'):
text = item + text[1:] text = item + text[1:]
t.text = text t.text = text
else: else:
hyperlinks.append(add_xe(item, t)) hyperlinks.append(add_xe(item, t, expand))
blocks.append(p) blocks.append(p)
return hyperlinks, blocks return hyperlinks, blocks

View File

@ -12,22 +12,25 @@ from lxml.etree import XPath as X
from calibre.utils.filenames import ascii_text from calibre.utils.filenames import ascii_text
DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument' # Names {{{
DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties' TRANSITIONAL_NAMES = {
APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties' 'DOCUMENT' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument',
STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles' 'DOCPROPS' : 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties',
NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering' 'APPPROPS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties',
FONTS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable' 'STYLES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles',
EMBEDDED_FONT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/font' 'NUMBERING' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering',
IMAGES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image' 'FONTS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable',
LINKS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink' 'EMBEDDED_FONT' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/font',
FOOTNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes' 'IMAGES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image',
ENDNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes' 'LINKS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink',
THEMES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme' 'FOOTNOTES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes',
SETTINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings' 'ENDNOTES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes',
WEB_SETTINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings' 'THEMES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme',
'SETTINGS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings',
'WEB_SETTINGS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings',
}
namespaces = { TRANSITIONAL_NAMESPACES = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main', 'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
'o': 'urn:schemas-microsoft-com:office:office', 'o': 'urn:schemas-microsoft-com:office:office',
've': 'http://schemas.openxmlformats.org/markup-compatibility/2006', 've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
@ -57,40 +60,13 @@ namespaces = {
'dcmitype': 'http://purl.org/dc/dcmitype/', 'dcmitype': 'http://purl.org/dc/dcmitype/',
'dcterms': 'http://purl.org/dc/terms/' 'dcterms': 'http://purl.org/dc/terms/'
} }
# }}}
xpath_cache = {}
def XPath(expr):
ans = xpath_cache.get(expr, None)
if ans is None:
xpath_cache[expr] = ans = X(expr, namespaces=namespaces)
return ans
def is_tag(x, q):
tag = getattr(x, 'tag', x)
ns, name = q.partition(':')[0::2]
return '{%s}%s' % (namespaces.get(ns, None), name) == tag
def barename(x): def barename(x):
return x.rpartition('}')[-1] return x.rpartition('}')[-1]
def XML(x): def XML(x):
return '{%s}%s' % (namespaces['xml'], x) return '{%s}%s' % (TRANSITIONAL_NAMESPACES['xml'], x)
def expand(name, sep=':'):
ns, tag = name.partition(sep)[::2]
if ns and tag:
tag = '{%s}%s' % (namespaces[ns], tag)
return tag or ns
def get(x, attr, default=None):
return x.attrib.get(expand(attr), default)
def ancestor(elem, name):
try:
return XPath('ancestor::%s[1]' % name)(elem)[0]
except IndexError:
return None
def generate_anchor(name, existing): def generate_anchor(name, existing):
x = y = 'id_' + re.sub(r'[^0-9a-zA-Z_]', '', ascii_text(name)).lstrip('_') x = y = 'id_' + re.sub(r'[^0-9a-zA-Z_]', '', ascii_text(name)).lstrip('_')
@ -100,14 +76,48 @@ def generate_anchor(name, existing):
c += 1 c += 1
return y return y
def children(elem, *args): class DOCXNamespace(object):
return XPath('|'.join('child::%s' % a for a in args))(elem)
def descendants(elem, *args): def __init__(self, transitional=True):
return XPath('|'.join('descendant::%s' % a for a in args))(elem) self.xpath_cache = {}
if transitional:
self.namespaces = TRANSITIONAL_NAMESPACES.copy()
self.names = TRANSITIONAL_NAMES.copy()
def makeelement(root, tag, append=True, **attrs): def XPath(self, expr):
ans = root.makeelement(expand(tag), **{expand(k, sep='_'):v for k, v in attrs.iteritems()}) ans = self.xpath_cache.get(expr, None)
if append: if ans is None:
root.append(ans) self.xpath_cache[expr] = ans = X(expr, namespaces=self.namespaces)
return ans return ans
def is_tag(self, x, q):
tag = getattr(x, 'tag', x)
ns, name = q.partition(':')[0::2]
return '{%s}%s' % (self.namespaces.get(ns, None), name) == tag
def expand(self, name, sep=':'):
ns, tag = name.partition(sep)[::2]
if ns and tag:
tag = '{%s}%s' % (self.namespaces[ns], tag)
return tag or ns
def get(self, x, attr, default=None):
return x.attrib.get(self.expand(attr), default)
def ancestor(self, elem, name):
try:
return self.XPath('ancestor::%s[1]' % name)(elem)[0]
except IndexError:
return None
def children(self, elem, *args):
return self.XPath('|'.join('child::%s' % a for a in args))(elem)
def descendants(self, elem, *args):
return self.XPath('|'.join('descendant::%s' % a for a in args))(elem)
def makeelement(self, root, tag, append=True, **attrs):
ans = root.makeelement(self.expand(tag), **{self.expand(k, sep='_'):v for k, v in attrs.iteritems()})
if append:
root.append(ans)
return ans

View File

@ -13,7 +13,6 @@ from lxml.html.builder import OL, UL, SPAN
from calibre.ebooks.docx.block_styles import ParagraphStyle from calibre.ebooks.docx.block_styles import ParagraphStyle
from calibre.ebooks.docx.char_styles import RunStyle, inherit from calibre.ebooks.docx.char_styles import RunStyle, inherit
from calibre.ebooks.docx.names import XPath, get
STYLE_MAP = { STYLE_MAP = {
'aiueo': 'hiragana', 'aiueo': 'hiragana',
@ -32,7 +31,8 @@ STYLE_MAP = {
class Level(object): class Level(object):
def __init__(self, lvl=None): def __init__(self, namespace, lvl=None):
self.namespace = namespace
self.restart = None self.restart = None
self.start = 0 self.start = 0
self.fmt = 'decimal' self.fmt = 'decimal'
@ -47,7 +47,7 @@ class Level(object):
self.read_from_xml(lvl) self.read_from_xml(lvl)
def copy(self): def copy(self):
ans = Level() ans = Level(self.namespace)
for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template', 'bullet_template'): for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template', 'bullet_template'):
setattr(ans, x, getattr(self, x)) setattr(ans, x, getattr(self, x))
return ans return ans
@ -61,6 +61,7 @@ class Level(object):
return re.sub(r'%(\d+)', sub, template).rstrip() + '\xa0' return re.sub(r'%(\d+)', sub, template).rstrip() + '\xa0'
def read_from_xml(self, lvl, override=False): def read_from_xml(self, lvl, override=False):
XPath, get = self.namespace.XPath, self.namespace.get
for lr in XPath('./w:lvlRestart[@w:val]')(lvl): for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
try: try:
self.restart = int(get(lr, 'w:val')) self.restart = int(get(lr, 'w:val'))
@ -74,7 +75,7 @@ class Level(object):
pass pass
for rPr in XPath('./w:rPr')(lvl): for rPr in XPath('./w:rPr')(lvl):
ps = RunStyle(rPr) ps = RunStyle(self.namespace, rPr)
if self.character_style is None: if self.character_style is None:
self.character_style = ps self.character_style = ps
else: else:
@ -106,7 +107,7 @@ class Level(object):
self.para_link = get(lr, 'w:val') self.para_link = get(lr, 'w:val')
for pPr in XPath('./w:pPr')(lvl): for pPr in XPath('./w:pPr')(lvl):
ps = ParagraphStyle(pPr) ps = ParagraphStyle(self.namespace, pPr)
if self.paragraph_style is None: if self.paragraph_style is None:
self.paragraph_style = ps self.paragraph_style = ps
else: else:
@ -135,7 +136,9 @@ class Level(object):
class NumberingDefinition(object): class NumberingDefinition(object):
def __init__(self, parent=None, an_id=None): def __init__(self, namespace, parent=None, an_id=None):
self.namespace = namespace
XPath, get = self.namespace.XPath, self.namespace.get
self.levels = {} self.levels = {}
self.abstract_numbering_definition_id = an_id self.abstract_numbering_definition_id = an_id
if parent is not None: if parent is not None:
@ -144,17 +147,18 @@ class NumberingDefinition(object):
ilvl = int(get(lvl, 'w:ilvl', 0)) ilvl = int(get(lvl, 'w:ilvl', 0))
except (TypeError, ValueError): except (TypeError, ValueError):
ilvl = 0 ilvl = 0
self.levels[ilvl] = Level(lvl) self.levels[ilvl] = Level(namespace, lvl)
def copy(self): def copy(self):
ans = NumberingDefinition(an_id=self.abstract_numbering_definition_id) ans = NumberingDefinition(self.namespace, an_id=self.abstract_numbering_definition_id)
for l, lvl in self.levels.iteritems(): for l, lvl in self.levels.iteritems():
ans.levels[l] = lvl.copy() ans.levels[l] = lvl.copy()
return ans return ans
class Numbering(object): class Numbering(object):
def __init__(self): def __init__(self, namespace):
self.namespace = namespace
self.definitions = {} self.definitions = {}
self.instances = {} self.instances = {}
self.counters = defaultdict(Counter) self.counters = defaultdict(Counter)
@ -163,6 +167,7 @@ class Numbering(object):
def __call__(self, root, styles, rid_map): def __call__(self, root, styles, rid_map):
' Read all numbering style definitions ' ' Read all numbering style definitions '
XPath, get = self.namespace.XPath, self.namespace.get
self.rid_map = rid_map self.rid_map = rid_map
for npb in XPath('./w:numPicBullet[@w:numPicBulletId]')(root): for npb in XPath('./w:numPicBullet[@w:numPicBulletId]')(root):
npbid = get(npb, 'w:numPicBulletId') npbid = get(npb, 'w:numPicBulletId')
@ -176,7 +181,7 @@ class Numbering(object):
if nsl: if nsl:
lazy_load[an_id] = get(nsl[0], 'w:val') lazy_load[an_id] = get(nsl[0], 'w:val')
else: else:
nd = NumberingDefinition(an, an_id=an_id) nd = NumberingDefinition(self.namespace, an, an_id=an_id)
self.definitions[an_id] = nd self.definitions[an_id] = nd
def create_instance(n, definition): def create_instance(n, definition):
@ -199,7 +204,7 @@ class Numbering(object):
ilvl = nilvl if ilvl is None else ilvl ilvl = nilvl if ilvl is None else ilvl
alvl = nd.levels.get(ilvl, None) alvl = nd.levels.get(ilvl, None)
if alvl is None: if alvl is None:
alvl = Level() alvl = Level(self.namespace)
alvl.read_from_xml(lvl, override=True) alvl.read_from_xml(lvl, override=True)
for ilvl, so in start_overrides.iteritems(): for ilvl, so in start_overrides.iteritems():
try: try:

View File

@ -6,17 +6,16 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.ebooks.docx.names import XPath, get
class Settings(object): class Settings(object):
def __init__(self): def __init__(self, namespace):
self.default_tab_stop = 720 / 20 self.default_tab_stop = 720 / 20
self.namespace = namespace
def __call__(self, root): def __call__(self, root):
for dts in XPath('//w:defaultTabStop[@w:val]')(root): for dts in self.namespace.XPath('//w:defaultTabStop[@w:val]')(root):
try: try:
self.default_tab_stop = int(get(dts, 'w:val')) / 20 self.default_tab_stop = int(self.namespace.get(dts, 'w:val')) / 20
except (ValueError, TypeError, AttributeError): except (ValueError, TypeError, AttributeError):
pass pass

View File

@ -12,7 +12,6 @@ from collections import OrderedDict, Counter
from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit
from calibre.ebooks.docx.char_styles import RunStyle from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.tables import TableStyle from calibre.ebooks.docx.tables import TableStyle
from calibre.ebooks.docx.names import XPath, get
class PageProperties(object): class PageProperties(object):
@ -21,12 +20,12 @@ class PageProperties(object):
sectPr elements. sectPr elements.
''' '''
def __init__(self, elems=()): def __init__(self, namespace, elems=()):
self.width = self.height = 595.28, 841.89 # pts, A4 self.width = self.height = 595.28, 841.89 # pts, A4
self.margin_left = self.margin_right = 72 # pts self.margin_left = self.margin_right = 72 # pts
for sectPr in elems: for sectPr in elems:
for pgSz in XPath('./w:pgSz')(sectPr): for pgSz in namespace.XPath('./w:pgSz')(sectPr):
w, h = get(pgSz, 'w:w'), get(pgSz, 'w:h') w, h = namespace.get(pgSz, 'w:w'), namespace.get(pgSz, 'w:h')
try: try:
self.width = int(w)/20 self.width = int(w)/20
except (ValueError, TypeError): except (ValueError, TypeError):
@ -35,8 +34,8 @@ class PageProperties(object):
self.height = int(h)/20 self.height = int(h)/20
except (ValueError, TypeError): except (ValueError, TypeError):
pass pass
for pgMar in XPath('./w:pgMar')(sectPr): for pgMar in namespace.XPath('./w:pgMar')(sectPr):
l, r = get(pgMar, 'w:left'), get(pgMar, 'w:right') l, r = namespace.get(pgMar, 'w:left'), namespace.get(pgMar, 'w:right')
try: try:
self.margin_left = int(l)/20 self.margin_left = int(l)/20
except (ValueError, TypeError): except (ValueError, TypeError):
@ -52,41 +51,41 @@ class Style(object):
Class representing a <w:style> element. Can contain block, character, etc. styles. Class representing a <w:style> element. Can contain block, character, etc. styles.
''' '''
name_path = XPath('./w:name[@w:val]') def __init__(self, namespace, elem):
based_on_path = XPath('./w:basedOn[@w:val]') self.namespace = namespace
self.name_path = namespace.XPath('./w:name[@w:val]')
def __init__(self, elem): self.based_on_path = namespace.XPath('./w:basedOn[@w:val]')
self.resolved = False self.resolved = False
self.style_id = get(elem, 'w:styleId') self.style_id = namespace.get(elem, 'w:styleId')
self.style_type = get(elem, 'w:type') self.style_type = namespace.get(elem, 'w:type')
names = self.name_path(elem) names = self.name_path(elem)
self.name = get(names[-1], 'w:val') if names else None self.name = namespace.get(names[-1], 'w:val') if names else None
based_on = self.based_on_path(elem) based_on = self.based_on_path(elem)
self.based_on = get(based_on[0], 'w:val') if based_on else None self.based_on = namespace.get(based_on[0], 'w:val') if based_on else None
if self.style_type == 'numbering': if self.style_type == 'numbering':
self.based_on = None self.based_on = None
self.is_default = get(elem, 'w:default') in {'1', 'on', 'true'} self.is_default = namespace.get(elem, 'w:default') in {'1', 'on', 'true'}
self.paragraph_style = self.character_style = self.table_style = None self.paragraph_style = self.character_style = self.table_style = None
if self.style_type in {'paragraph', 'character', 'table'}: if self.style_type in {'paragraph', 'character', 'table'}:
if self.style_type == 'table': if self.style_type == 'table':
for tblPr in XPath('./w:tblPr')(elem): for tblPr in namespace.XPath('./w:tblPr')(elem):
ts = TableStyle(tblPr) ts = TableStyle(namespace, tblPr)
if self.table_style is None: if self.table_style is None:
self.table_style = ts self.table_style = ts
else: else:
self.table_style.update(ts) self.table_style.update(ts)
if self.style_type in {'paragraph', 'table'}: if self.style_type in {'paragraph', 'table'}:
for pPr in XPath('./w:pPr')(elem): for pPr in namespace.XPath('./w:pPr')(elem):
ps = ParagraphStyle(pPr) ps = ParagraphStyle(namespace, pPr)
if self.paragraph_style is None: if self.paragraph_style is None:
self.paragraph_style = ps self.paragraph_style = ps
else: else:
self.paragraph_style.update(ps) self.paragraph_style.update(ps)
for rPr in XPath('./w:rPr')(elem): for rPr in namespace.XPath('./w:rPr')(elem):
rs = RunStyle(rPr) rs = RunStyle(namespace, rPr)
if self.character_style is None: if self.character_style is None:
self.character_style = rs self.character_style = rs
else: else:
@ -94,21 +93,21 @@ class Style(object):
if self.style_type in {'numbering', 'paragraph'}: if self.style_type in {'numbering', 'paragraph'}:
self.numbering_style_link = None self.numbering_style_link = None
for x in XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem): for x in namespace.XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
self.numbering_style_link = get(x, 'w:val') self.numbering_style_link = namespace.get(x, 'w:val')
def resolve_based_on(self, parent): def resolve_based_on(self, parent):
if parent.table_style is not None: if parent.table_style is not None:
if self.table_style is None: if self.table_style is None:
self.table_style = TableStyle() self.table_style = TableStyle(self.namespace)
self.table_style.resolve_based_on(parent.table_style) self.table_style.resolve_based_on(parent.table_style)
if parent.paragraph_style is not None: if parent.paragraph_style is not None:
if self.paragraph_style is None: if self.paragraph_style is None:
self.paragraph_style = ParagraphStyle() self.paragraph_style = ParagraphStyle(self.namespace)
self.paragraph_style.resolve_based_on(parent.paragraph_style) self.paragraph_style.resolve_based_on(parent.paragraph_style)
if parent.character_style is not None: if parent.character_style is not None:
if self.character_style is None: if self.character_style is None:
self.character_style = RunStyle() self.character_style = RunStyle(self.namespace)
self.character_style.resolve_based_on(parent.character_style) self.character_style.resolve_based_on(parent.character_style)
@ -118,7 +117,8 @@ class Styles(object):
Collection of all styles defined in the document. Used to get the final styles applicable to elements in the document markup. Collection of all styles defined in the document. Used to get the final styles applicable to elements in the document markup.
''' '''
def __init__(self, tables): def __init__(self, namespace, tables):
self.namespace = namespace
self.id_map = OrderedDict() self.id_map = OrderedDict()
self.para_cache = {} self.para_cache = {}
self.para_char_cache = {} self.para_char_cache = {}
@ -144,8 +144,8 @@ class Styles(object):
def __call__(self, root, fonts, theme): def __call__(self, root, fonts, theme):
self.fonts, self.theme = fonts, theme self.fonts, self.theme = fonts, theme
for s in XPath('//w:style')(root): for s in self.namespace.XPath('//w:style')(root):
s = Style(s) s = Style(self.namespace, s)
if s.style_id: if s.style_id:
self.id_map[s.style_id] = s self.id_map[s.style_id] = s
if s.is_default: if s.is_default:
@ -155,17 +155,17 @@ class Styles(object):
self.default_paragraph_style = self.default_character_style = None self.default_paragraph_style = self.default_character_style = None
for dd in XPath('./w:docDefaults')(root): for dd in self.namespace.XPath('./w:docDefaults')(root):
for pd in XPath('./w:pPrDefault')(dd): for pd in self.namespace.XPath('./w:pPrDefault')(dd):
for pPr in XPath('./w:pPr')(pd): for pPr in self.namespace.XPath('./w:pPr')(pd):
ps = ParagraphStyle(pPr) ps = ParagraphStyle(self.namespace, pPr)
if self.default_paragraph_style is None: if self.default_paragraph_style is None:
self.default_paragraph_style = ps self.default_paragraph_style = ps
else: else:
self.default_paragraph_style.update(ps) self.default_paragraph_style.update(ps)
for pd in XPath('./w:rPrDefault')(dd): for pd in self.namespace.XPath('./w:rPrDefault')(dd):
for pPr in XPath('./w:rPr')(pd): for pPr in self.namespace.XPath('./w:rPr')(pd):
ps = RunStyle(pPr) ps = RunStyle(self.namespace, pPr)
if self.default_character_style is None: if self.default_character_style is None:
self.default_character_style = ps self.default_character_style = ps
else: else:
@ -213,18 +213,18 @@ class Styles(object):
ans = self.para_cache.get(p, None) ans = self.para_cache.get(p, None)
if ans is None: if ans is None:
linked_style = None linked_style = None
ans = self.para_cache[p] = ParagraphStyle() ans = self.para_cache[p] = ParagraphStyle(self.namespace)
ans.style_name = None ans.style_name = None
direct_formatting = None direct_formatting = None
for pPr in XPath('./w:pPr')(p): for pPr in self.namespace.XPath('./w:pPr')(p):
ps = ParagraphStyle(pPr) ps = ParagraphStyle(self.namespace, pPr)
if direct_formatting is None: if direct_formatting is None:
direct_formatting = ps direct_formatting = ps
else: else:
direct_formatting.update(ps) direct_formatting.update(ps)
if direct_formatting is None: if direct_formatting is None:
direct_formatting = ParagraphStyle() direct_formatting = ParagraphStyle(self.namespace)
parent_styles = [] parent_styles = []
if self.default_paragraph_style is not None: if self.default_paragraph_style is not None:
parent_styles.append(self.default_paragraph_style) parent_styles.append(self.default_paragraph_style)
@ -275,19 +275,19 @@ class Styles(object):
def resolve_run(self, r): def resolve_run(self, r):
ans = self.run_cache.get(r, None) ans = self.run_cache.get(r, None)
if ans is None: if ans is None:
p = XPath('ancestor::w:p[1]')(r) p = self.namespace.XPath('ancestor::w:p[1]')(r)
p = p[0] if p else None p = p[0] if p else None
ans = self.run_cache[r] = RunStyle() ans = self.run_cache[r] = RunStyle(self.namespace)
direct_formatting = None direct_formatting = None
for rPr in XPath('./w:rPr')(r): for rPr in self.namespace.XPath('./w:rPr')(r):
rs = RunStyle(rPr) rs = RunStyle(self.namespace, rPr)
if direct_formatting is None: if direct_formatting is None:
direct_formatting = rs direct_formatting = rs
else: else:
direct_formatting.update(rs) direct_formatting.update(rs)
if direct_formatting is None: if direct_formatting is None:
direct_formatting = RunStyle() direct_formatting = RunStyle(self.namespace)
parent_styles = [] parent_styles = []
default_char = self.default_styles.get('character', None) default_char = self.default_styles.get('character', None)
@ -484,5 +484,3 @@ class Styles(object):
b = '\n'.join(b) b = '\n'.join(b)
ans.append('.%s {\n%s\n}\n' % (cls, b.rstrip(';'))) ans.append('.%s {\n%s\n}\n' % (cls, b.rstrip(';')))
return prefix + '\n' + '\n'.join(ans) return prefix + '\n' + '\n'.join(ans)

View File

@ -10,13 +10,12 @@ from lxml.html.builder import TABLE, TR, TD
from calibre.ebooks.docx.block_styles import inherit, read_shd as rs, read_border, binary_property, border_props, ParagraphStyle from calibre.ebooks.docx.block_styles import inherit, read_shd as rs, read_border, binary_property, border_props, ParagraphStyle
from calibre.ebooks.docx.char_styles import RunStyle from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.names import XPath, get, is_tag
# Read from XML {{{ # Read from XML {{{
read_shd = rs read_shd = rs
edges = ('left', 'top', 'right', 'bottom') edges = ('left', 'top', 'right', 'bottom')
def _read_width(elem): def _read_width(elem, get):
ans = inherit ans = inherit
try: try:
w = int(get(elem, 'w:w')) w = int(get(elem, 'w:w'))
@ -33,29 +32,29 @@ def _read_width(elem):
ans = '%.3g%%' % (w/50) ans = '%.3g%%' % (w/50)
return ans return ans
def read_width(parent, dest): def read_width(parent, dest, XPath, get):
ans = inherit ans = inherit
for tblW in XPath('./w:tblW')(parent): for tblW in XPath('./w:tblW')(parent):
ans = _read_width(tblW) ans = _read_width(tblW, get)
setattr(dest, 'width', ans) setattr(dest, 'width', ans)
def read_cell_width(parent, dest): def read_cell_width(parent, dest, XPath, get):
ans = inherit ans = inherit
for tblW in XPath('./w:tcW')(parent): for tblW in XPath('./w:tcW')(parent):
ans = _read_width(tblW) ans = _read_width(tblW, get)
setattr(dest, 'width', ans) setattr(dest, 'width', ans)
def read_padding(parent, dest): def read_padding(parent, dest, XPath, get):
name = 'tblCellMar' if parent.tag.endswith('}tblPr') else 'tcMar' name = 'tblCellMar' if parent.tag.endswith('}tblPr') else 'tcMar'
ans = {x:inherit for x in edges} ans = {x:inherit for x in edges}
for mar in XPath('./w:%s' % name)(parent): for mar in XPath('./w:%s' % name)(parent):
for x in edges: for x in edges:
for edge in XPath('./w:%s' % x)(mar): for edge in XPath('./w:%s' % x)(mar):
ans[x] = _read_width(edge) ans[x] = _read_width(edge, get)
for x in edges: for x in edges:
setattr(dest, 'cell_padding_%s' % x, ans[x]) setattr(dest, 'cell_padding_%s' % x, ans[x])
def read_justification(parent, dest): def read_justification(parent, dest, XPath, get):
left = right = inherit left = right = inherit
for jc in XPath('./w:jc[@w:val]')(parent): for jc in XPath('./w:jc[@w:val]')(parent):
val = get(jc, 'w:val') val = get(jc, 'w:val')
@ -70,31 +69,31 @@ def read_justification(parent, dest):
setattr(dest, 'margin_left', left) setattr(dest, 'margin_left', left)
setattr(dest, 'margin_right', right) setattr(dest, 'margin_right', right)
def read_spacing(parent, dest): def read_spacing(parent, dest, XPath, get):
ans = inherit ans = inherit
for cs in XPath('./w:tblCellSpacing')(parent): for cs in XPath('./w:tblCellSpacing')(parent):
ans = _read_width(cs) ans = _read_width(cs, get)
setattr(dest, 'spacing', ans) setattr(dest, 'spacing', ans)
def read_float(parent, dest): def read_float(parent, dest, XPath, get):
ans = inherit ans = inherit
for x in XPath('./w:tblpPr')(parent): for x in XPath('./w:tblpPr')(parent):
ans = {k.rpartition('}')[-1]: v for k, v in x.attrib.iteritems()} ans = {k.rpartition('}')[-1]: v for k, v in x.attrib.iteritems()}
setattr(dest, 'float', ans) setattr(dest, 'float', ans)
def read_indent(parent, dest): def read_indent(parent, dest, XPath, get):
ans = inherit ans = inherit
for cs in XPath('./w:tblInd')(parent): for cs in XPath('./w:tblInd')(parent):
ans = _read_width(cs) ans = _read_width(cs, get)
setattr(dest, 'indent', ans) setattr(dest, 'indent', ans)
border_edges = ('left', 'top', 'right', 'bottom', 'insideH', 'insideV') border_edges = ('left', 'top', 'right', 'bottom', 'insideH', 'insideV')
def read_borders(parent, dest): def read_borders(parent, dest, XPath, get):
name = 'tblBorders' if parent.tag.endswith('}tblPr') else 'tcBorders' name = 'tblBorders' if parent.tag.endswith('}tblPr') else 'tcBorders'
read_border(parent, dest, border_edges, name) read_border(parent, dest, XPath, get, border_edges, name)
def read_height(parent, dest): def read_height(parent, dest, XPath, get):
ans = inherit ans = inherit
for rh in XPath('./w:trHeight')(parent): for rh in XPath('./w:trHeight')(parent):
rule = get(rh, 'w:hRule', 'auto') rule = get(rh, 'w:hRule', 'auto')
@ -103,14 +102,14 @@ def read_height(parent, dest):
ans = (rule, val) ans = (rule, val)
setattr(dest, 'height', ans) setattr(dest, 'height', ans)
def read_vertical_align(parent, dest): def read_vertical_align(parent, dest, XPath, get):
ans = inherit ans = inherit
for va in XPath('./w:vAlign')(parent): for va in XPath('./w:vAlign')(parent):
val = get(va, 'w:val') val = get(va, 'w:val')
ans = {'center': 'middle', 'top': 'top', 'bottom': 'bottom'}.get(val, 'middle') ans = {'center': 'middle', 'top': 'top', 'bottom': 'bottom'}.get(val, 'middle')
setattr(dest, 'vertical_align', ans) setattr(dest, 'vertical_align', ans)
def read_col_span(parent, dest): def read_col_span(parent, dest, XPath, get):
ans = inherit ans = inherit
for gs in XPath('./w:gridSpan')(parent): for gs in XPath('./w:gridSpan')(parent):
try: try:
@ -119,14 +118,14 @@ def read_col_span(parent, dest):
continue continue
setattr(dest, 'col_span', ans) setattr(dest, 'col_span', ans)
def read_merge(parent, dest): def read_merge(parent, dest, XPath, get):
for x in ('hMerge', 'vMerge'): for x in ('hMerge', 'vMerge'):
ans = inherit ans = inherit
for m in XPath('./w:%s' % x)(parent): for m in XPath('./w:%s' % x)(parent):
ans = get(m, 'w:val', 'continue') ans = get(m, 'w:val', 'continue')
setattr(dest, x, ans) setattr(dest, x, ans)
def read_band_size(parent, dest): def read_band_size(parent, dest, XPath, get):
for x in ('Col', 'Row'): for x in ('Col', 'Row'):
ans = 1 ans = 1
for y in XPath('./w:tblStyle%sBandSize' % x)(parent): for y in XPath('./w:tblStyle%sBandSize' % x)(parent):
@ -136,7 +135,7 @@ def read_band_size(parent, dest):
continue continue
setattr(dest, '%s_band_size' % x.lower(), ans) setattr(dest, '%s_band_size' % x.lower(), ans)
def read_look(parent, dest): def read_look(parent, dest, XPath, get):
ans = 0 ans = 0
for x in XPath('./w:tblLook')(parent): for x in XPath('./w:tblLook')(parent):
try: try:
@ -148,8 +147,10 @@ def read_look(parent, dest):
# }}} # }}}
def clone(style): def clone(style):
if style is None:
return None
try: try:
ans = type(style)() ans = type(style)(style.namespace)
except TypeError: except TypeError:
return None return None
ans.update(style) ans.update(style)
@ -190,16 +191,17 @@ class RowStyle(Style):
all_properties = ('height', 'cantSplit', 'hidden', 'spacing',) all_properties = ('height', 'cantSplit', 'hidden', 'spacing',)
def __init__(self, namespace, trPr=None):
    """Initialise the row style, optionally reading it from *trPr*.

    :param namespace: object providing the ``XPath`` and ``get`` helpers
        that are forwarded to the property readers; kept on
        ``self.namespace``.
    :param trPr: element to read the properties from. When it is ``None``
        every name in ``all_properties`` is set to ``inherit``.
    """
    self.namespace = namespace
    if trPr is None:
        for p in self.all_properties:
            setattr(self, p, inherit)
    else:
        for p in ('hidden', 'cantSplit'):
            setattr(self, p, binary_property(trPr, p, namespace.XPath, namespace.get))
        for p in ('spacing', 'height'):
            # The readers are module-level functions named read_<property>
            # that store their result directly on the object passed as the
            # second argument.
            f = globals()['read_%s' % p]
            f(trPr, self, namespace.XPath, namespace.get)
    # NOTE(review): the source lost its indentation; this assignment is
    # placed at method level so that _css exists on both branches -- confirm.
    self._css = None
@property @property
@ -226,14 +228,15 @@ class CellStyle(Style):
'cell_padding_bottom', 'width', 'vertical_align', 'col_span', 'vMerge', 'hMerge', 'row_span', 'cell_padding_bottom', 'width', 'vertical_align', 'col_span', 'vMerge', 'hMerge', 'row_span',
) + tuple(k % edge for edge in border_edges for k in border_props) ) + tuple(k % edge for edge in border_edges for k in border_props)
def __init__(self, tcPr=None): def __init__(self, namespace, tcPr=None):
self.namespace = namespace
if tcPr is None: if tcPr is None:
for p in self.all_properties: for p in self.all_properties:
setattr(self, p, inherit) setattr(self, p, inherit)
else: else:
for x in ('borders', 'shd', 'padding', 'cell_width', 'vertical_align', 'col_span', 'merge'): for x in ('borders', 'shd', 'padding', 'cell_width', 'vertical_align', 'col_span', 'merge'):
f = globals()['read_%s' % x] f = globals()['read_%s' % x]
f(tcPr, self) f(tcPr, self, namespace.XPath, namespace.get)
self.row_span = inherit self.row_span = inherit
self._css = None self._css = None
@ -270,7 +273,8 @@ class TableStyle(Style):
'spacing', 'indent', 'overrides', 'col_band_size', 'row_band_size', 'look', 'spacing', 'indent', 'overrides', 'col_band_size', 'row_band_size', 'look',
) + tuple(k % edge for edge in border_edges for k in border_props) ) + tuple(k % edge for edge in border_edges for k in border_props)
def __init__(self, tblPr=None): def __init__(self, namespace, tblPr=None):
self.namespace = namespace
if tblPr is None: if tblPr is None:
for p in self.all_properties: for p in self.all_properties:
setattr(self, p, inherit) setattr(self, p, inherit)
@ -278,23 +282,23 @@ class TableStyle(Style):
self.overrides = inherit self.overrides = inherit
for x in ('width', 'float', 'padding', 'shd', 'justification', 'spacing', 'indent', 'borders', 'band_size', 'look'): for x in ('width', 'float', 'padding', 'shd', 'justification', 'spacing', 'indent', 'borders', 'band_size', 'look'):
f = globals()['read_%s' % x] f = globals()['read_%s' % x]
f(tblPr, self) f(tblPr, self, self.namespace.XPath, self.namespace.get)
parent = tblPr.getparent() parent = tblPr.getparent()
if is_tag(parent, 'w:style'): if self.namespace.is_tag(parent, 'w:style'):
self.overrides = {} self.overrides = {}
for tblStylePr in XPath('./w:tblStylePr[@w:type]')(parent): for tblStylePr in self.namespace.XPath('./w:tblStylePr[@w:type]')(parent):
otype = get(tblStylePr, 'w:type') otype = self.namespace.get(tblStylePr, 'w:type')
orides = self.overrides[otype] = {} orides = self.overrides[otype] = {}
for tblPr in XPath('./w:tblPr')(tblStylePr): for tblPr in self.namespace.XPath('./w:tblPr')(tblStylePr):
orides['table'] = TableStyle(tblPr) orides['table'] = TableStyle(self.namespace, tblPr)
for trPr in XPath('./w:trPr')(tblStylePr): for trPr in self.namespace.XPath('./w:trPr')(tblStylePr):
orides['row'] = RowStyle(trPr) orides['row'] = RowStyle(self.namespace, trPr)
for tcPr in XPath('./w:tcPr')(tblStylePr): for tcPr in self.namespace.XPath('./w:tcPr')(tblStylePr):
orides['cell'] = CellStyle(tcPr) orides['cell'] = CellStyle(self.namespace, tcPr)
for pPr in XPath('./w:pPr')(tblStylePr): for pPr in self.namespace.XPath('./w:pPr')(tblStylePr):
orides['para'] = ParagraphStyle(pPr) orides['para'] = ParagraphStyle(self.namespace, pPr)
for rPr in XPath('./w:rPr')(tblStylePr): for rPr in self.namespace.XPath('./w:rPr')(tblStylePr):
orides['run'] = RunStyle(rPr) orides['run'] = RunStyle(self.namespace, rPr)
self._css = None self._css = None
def resolve_based_on(self, parent): def resolve_based_on(self, parent):
@ -343,16 +347,17 @@ class TableStyle(Style):
class Table(object): class Table(object):
def __init__(self, tbl, styles, para_map, is_sub_table=False): def __init__(self, namespace, tbl, styles, para_map, is_sub_table=False):
self.namespace = namespace
self.tbl = tbl self.tbl = tbl
self.styles = styles self.styles = styles
self.is_sub_table = is_sub_table self.is_sub_table = is_sub_table
# Read Table Style # Read Table Style
style = {'table':TableStyle()} style = {'table':TableStyle(self.namespace)}
for tblPr in XPath('./w:tblPr')(tbl): for tblPr in self.namespace.XPath('./w:tblPr')(tbl):
for ts in XPath('./w:tblStyle[@w:val]')(tblPr): for ts in self.namespace.XPath('./w:tblStyle[@w:val]')(tblPr):
style_id = get(ts, 'w:val') style_id = self.namespace.get(ts, 'w:val')
s = styles.get(style_id) s = styles.get(style_id)
if s is not None: if s is not None:
if s.table_style is not None: if s.table_style is not None:
@ -367,7 +372,7 @@ class Table(object):
style['run'].update(s.character_style) style['run'].update(s.character_style)
else: else:
style['run'] = s.character_style style['run'] = s.character_style
style['table'].update(TableStyle(tblPr)) style['table'].update(TableStyle(self.namespace, tblPr))
self.table_style, self.paragraph_style = style['table'], style.get('paragraph', None) self.table_style, self.paragraph_style = style['table'], style.get('paragraph', None)
self.run_style = style.get('run', None) self.run_style = style.get('run', None)
self.overrides = self.table_style.overrides self.overrides = self.table_style.overrides
@ -380,23 +385,23 @@ class Table(object):
self.paragraphs = [] self.paragraphs = []
self.cell_map = [] self.cell_map = []
rows = XPath('./w:tr')(tbl) rows = self.namespace.XPath('./w:tr')(tbl)
for r, tr in enumerate(rows): for r, tr in enumerate(rows):
overrides = self.get_overrides(r, None, len(rows), None) overrides = self.get_overrides(r, None, len(rows), None)
self.resolve_row_style(tr, overrides) self.resolve_row_style(tr, overrides)
cells = XPath('./w:tc')(tr) cells = self.namespace.XPath('./w:tc')(tr)
self.cell_map.append([]) self.cell_map.append([])
for c, tc in enumerate(cells): for c, tc in enumerate(cells):
overrides = self.get_overrides(r, c, len(rows), len(cells)) overrides = self.get_overrides(r, c, len(rows), len(cells))
self.resolve_cell_style(tc, overrides, r, c, len(rows), len(cells)) self.resolve_cell_style(tc, overrides, r, c, len(rows), len(cells))
self.cell_map[-1].append(tc) self.cell_map[-1].append(tc)
for p in XPath('./w:p')(tc): for p in self.namespace.XPath('./w:p')(tc):
para_map[p] = self para_map[p] = self
self.paragraphs.append(p) self.paragraphs.append(p)
self.resolve_para_style(p, overrides) self.resolve_para_style(p, overrides)
self.handle_merged_cells() self.handle_merged_cells()
self.sub_tables = {x:Table(x, styles, para_map, is_sub_table=True) for x in XPath('./w:tr/w:tc/w:tbl')(tbl)} self.sub_tables = {x:Table(namespace, x, styles, para_map, is_sub_table=True) for x in self.namespace.XPath('./w:tr/w:tc/w:tbl')(tbl)}
def override_allowed(self, name): def override_allowed(self, name):
'Check if the named override is allowed by the tblLook element' 'Check if the named override is allowed by the tblLook element'
@ -449,7 +454,7 @@ class Table(object):
return tuple(filter(self.override_allowed, overrides)) return tuple(filter(self.override_allowed, overrides))
def resolve_row_style(self, tr, overrides): def resolve_row_style(self, tr, overrides):
rs = RowStyle() rs = RowStyle(self.namespace)
for o in overrides: for o in overrides:
if o in self.overrides: if o in self.overrides:
ovr = self.overrides[o] ovr = self.overrides[o]
@ -457,12 +462,12 @@ class Table(object):
if ors is not None: if ors is not None:
rs.update(ors) rs.update(ors)
for trPr in XPath('./w:trPr')(tr): for trPr in self.namespace.XPath('./w:trPr')(tr):
rs.update(RowStyle(trPr)) rs.update(RowStyle(self.namespace, trPr))
self.style_map[tr] = rs self.style_map[tr] = rs
def resolve_cell_style(self, tc, overrides, row, col, rows, cols_in_row): def resolve_cell_style(self, tc, overrides, row, col, rows, cols_in_row):
cs = CellStyle() cs = CellStyle(self.namespace)
# from lxml.etree import tostring # from lxml.etree import tostring
# txt = tostring(tc, method='text', encoding=unicode) # txt = tostring(tc, method='text', encoding=unicode)
for o in overrides: for o in overrides:
@ -472,8 +477,8 @@ class Table(object):
if ors is not None: if ors is not None:
cs.update(ors) cs.update(ors)
for tcPr in XPath('./w:tcPr')(tc): for tcPr in self.namespace.XPath('./w:tcPr')(tc):
cs.update(CellStyle(tcPr)) cs.update(CellStyle(self.namespace, tcPr))
for x in edges: for x in edges:
p = 'cell_padding_%s' % x p = 'cell_padding_%s' % x
@ -535,7 +540,7 @@ class Table(object):
try: try:
s = self.style_map[cell] s = self.style_map[cell]
except KeyError: # cell is None except KeyError: # cell is None
s = CellStyle() s = CellStyle(self.namespace)
if s.vMerge == 'restart': if s.vMerge == 'restart':
runs.append([cell]) runs.append([cell])
elif s.vMerge == 'continue': elif s.vMerge == 'continue':
@ -555,7 +560,7 @@ class Table(object):
try: try:
s = self.style_map[cell] s = self.style_map[cell]
except KeyError: # cell is None except KeyError: # cell is None
s = CellStyle() s = CellStyle(self.namespace)
if s.col_span is not inherit: if s.col_span is not inherit:
runs.append([]) runs.append([])
continue continue
@ -593,12 +598,12 @@ class Table(object):
parent.insert(idx, table) parent.insert(idx, table)
else: else:
parent.append(table) parent.append(table)
for row in XPath('./w:tr')(self.tbl): for row in self.namespace.XPath('./w:tr')(self.tbl):
tr = TR('\n\t\t\t') tr = TR('\n\t\t\t')
style_map[tr] = self.style_map[row] style_map[tr] = self.style_map[row]
tr.tail = '\n\t\t' tr.tail = '\n\t\t'
table.append(tr) table.append(tr)
for tc in XPath('./w:tc')(row): for tc in self.namespace.XPath('./w:tc')(row):
td = TD() td = TD()
style_map[td] = s = self.style_map[tc] style_map[td] = s = self.style_map[tc]
if s.col_span is not inherit: if s.col_span is not inherit:
@ -607,7 +612,7 @@ class Table(object):
td.set('rowspan', type('')(s.row_span)) td.set('rowspan', type('')(s.row_span))
td.tail = '\n\t\t\t' td.tail = '\n\t\t\t'
tr.append(td) tr.append(td)
for x in XPath('./w:p|./w:tbl')(tc): for x in self.namespace.XPath('./w:p|./w:tbl')(tc):
if x.tag.endswith('}p'): if x.tag.endswith('}p'):
td.append(rmap[x]) td.append(rmap[x])
else: else:
@ -627,15 +632,16 @@ class Table(object):
class Tables(object): class Tables(object):
def __init__(self): def __init__(self, namespace):
self.tables = [] self.tables = []
self.para_map = {} self.para_map = {}
self.sub_tables = set() self.sub_tables = set()
self.namespace = namespace
def register(self, tbl, styles): def register(self, tbl, styles):
if tbl in self.sub_tables: if tbl in self.sub_tables:
return return
self.tables.append(Table(tbl, styles, self.para_map)) self.tables.append(Table(self.namespace, tbl, styles, self.para_map))
self.sub_tables |= set(self.tables[-1].sub_tables) self.sub_tables |= set(self.tables[-1].sub_tables)
def apply_markup(self, object_map, page_map): def apply_markup(self, object_map, page_map):

View File

@ -6,22 +6,21 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.ebooks.docx.names import XPath
class Theme(object): class Theme(object):
def __init__(self): def __init__(self, namespace):
self.major_latin_font = 'Cambria' self.major_latin_font = 'Cambria'
self.minor_latin_font = 'Calibri' self.minor_latin_font = 'Calibri'
self.namespace = namespace
def __call__(self, root): def __call__(self, root):
for fs in XPath('//a:fontScheme')(root): for fs in self.namespace.XPath('//a:fontScheme')(root):
for mj in XPath('./a:majorFont')(fs): for mj in self.namespace.XPath('./a:majorFont')(fs):
for l in XPath('./a:latin[@typeface]')(mj): for l in self.namespace.XPath('./a:latin[@typeface]')(mj):
self.major_latin_font = l.get('typeface') self.major_latin_font = l.get('typeface')
for mj in XPath('./a:minorFont')(fs): for mj in self.namespace.XPath('./a:minorFont')(fs):
for l in XPath('./a:latin[@typeface]')(mj): for l in self.namespace.XPath('./a:latin[@typeface]')(mj):
self.minor_latin_font = l.get('typeface') self.minor_latin_font = l.get('typeface')
def resolve_font_family(self, ff): def resolve_font_family(self, ff):

View File

@ -15,9 +15,7 @@ from lxml.html.builder import (
from calibre import guess_type from calibre import guess_type
from calibre.ebooks.docx.container import DOCX, fromstring from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import ( from calibre.ebooks.docx.names import XML, generate_anchor
XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor,
descendants, FOOTNOTES, ENDNOTES, children, THEMES, SETTINGS)
from calibre.ebooks.docx.styles import Styles, inherit, PageProperties from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
from calibre.ebooks.docx.numbering import Numbering from calibre.ebooks.docx.numbering import Numbering
from calibre.ebooks.docx.fonts import Fonts from calibre.ebooks.docx.fonts import Fonts
@ -54,6 +52,7 @@ class Convert(object):
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None): def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
self.docx = DOCX(path_or_stream, log=log) self.docx = DOCX(path_or_stream, log=log)
self.namespace = self.docx.namespace
self.ms_pat = re.compile(r'\s{2,}') self.ms_pat = re.compile(r'\s{2,}')
self.ws_pat = re.compile(r'[\n\r\t]') self.ws_pat = re.compile(r'[\n\r\t]')
self.log = self.docx.log self.log = self.docx.log
@ -62,12 +61,12 @@ class Convert(object):
self.dest_dir = dest_dir or os.getcwdu() self.dest_dir = dest_dir or os.getcwdu()
self.mi = self.docx.metadata self.mi = self.docx.metadata
self.body = BODY() self.body = BODY()
self.theme = Theme() self.theme = Theme(self.namespace)
self.settings = Settings() self.settings = Settings(self.namespace)
self.tables = Tables() self.tables = Tables(self.namespace)
self.fields = Fields() self.fields = Fields(self.namespace)
self.styles = Styles(self.tables) self.styles = Styles(self.namespace, self.tables)
self.images = Images(self.log) self.images = Images(self.namespace, self.log)
self.object_map = OrderedDict() self.object_map = OrderedDict()
self.html = HTML( self.html = HTML(
HEAD( HEAD(
@ -211,7 +210,7 @@ class Convert(object):
html_obj.set('class', cls) html_obj.set('class', cls)
if notes_header is not None: if notes_header is not None:
for h in children(self.body, 'h1', 'h2', 'h3'): for h in self.namespace.children(self.body, 'h1', 'h2', 'h3'):
notes_header.tag = h.tag notes_header.tag = h.tag
cls = h.get('class', None) cls = h.get('class', None)
if cls and cls != 'notes-header': if cls and cls != 'notes-header':
@ -221,7 +220,7 @@ class Convert(object):
self.fields.polish_markup(self.object_map) self.fields.polish_markup(self.object_map)
self.log.debug('Cleaning up redundant markup generated by Word') self.log.debug('Cleaning up redundant markup generated by Word')
self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover) self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)
return self.write(doc) return self.write(doc)
@ -230,14 +229,14 @@ class Convert(object):
self.page_map = OrderedDict() self.page_map = OrderedDict()
self.section_starts = [] self.section_starts = []
for p in descendants(doc, 'w:p', 'w:tbl'): for p in self.namespace.descendants(doc, 'w:p', 'w:tbl'):
if p.tag.endswith('}tbl'): if p.tag.endswith('}tbl'):
self.tables.register(p, self.styles) self.tables.register(p, self.styles)
current.append(p) current.append(p)
continue continue
sect = tuple(descendants(p, 'w:sectPr')) sect = tuple(self.namespace.descendants(p, 'w:sectPr'))
if sect: if sect:
pr = PageProperties(sect) pr = PageProperties(self.namespace, sect)
paras = current + [p] paras = current + [p]
for x in paras: for x in paras:
self.page_map[x] = pr self.page_map[x] = pr
@ -248,8 +247,8 @@ class Convert(object):
if current: if current:
self.section_starts.append(current[0]) self.section_starts.append(current[0])
last = XPath('./w:body/w:sectPr')(doc) last = self.namespace.XPath('./w:body/w:sectPr')(doc)
pr = PageProperties(last) pr = PageProperties(self.namespace, last)
for x in current: for x in current:
self.page_map[x] = pr self.page_map[x] = pr
@ -264,16 +263,16 @@ class Convert(object):
name = name name = name
return name return name
nname = get_name(NUMBERING, 'numbering.xml') nname = get_name(self.namespace.names['NUMBERING'], 'numbering.xml')
sname = get_name(STYLES, 'styles.xml') sname = get_name(self.namespace.names['STYLES'], 'styles.xml')
sename = get_name(SETTINGS, 'settings.xml') sename = get_name(self.namespace.names['SETTINGS'], 'settings.xml')
fname = get_name(FONTS, 'fontTable.xml') fname = get_name(self.namespace.names['FONTS'], 'fontTable.xml')
tname = get_name(THEMES, 'theme1.xml') tname = get_name(self.namespace.names['THEMES'], 'theme1.xml')
foname = get_name(FOOTNOTES, 'footnotes.xml') foname = get_name(self.namespace.names['FOOTNOTES'], 'footnotes.xml')
enname = get_name(ENDNOTES, 'endnotes.xml') enname = get_name(self.namespace.names['ENDNOTES'], 'endnotes.xml')
numbering = self.numbering = Numbering() numbering = self.numbering = Numbering(self.namespace)
footnotes = self.footnotes = Footnotes() footnotes = self.footnotes = Footnotes(self.namespace)
fonts = self.fonts = Fonts() fonts = self.fonts = Fonts(self.namespace)
foraw = enraw = None foraw = enraw = None
forel, enrel = ({}, {}), ({}, {}) forel, enrel = ({}, {}), ({}, {})
@ -337,7 +336,7 @@ class Convert(object):
self.styles.resolve_numbering(numbering) self.styles.resolve_numbering(numbering)
def write(self, doc): def write(self, doc):
toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log) toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)
raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>') raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f: with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
f.write(raw) f.write(raw)
@ -363,11 +362,11 @@ class Convert(object):
return os.path.join(self.dest_dir, 'metadata.opf') return os.path.join(self.dest_dir, 'metadata.opf')
def read_block_anchors(self, doc): def read_block_anchors(self, doc):
doc_anchors = frozenset(XPath('./w:body/w:bookmarkStart[@w:name]')(doc)) doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
if doc_anchors: if doc_anchors:
current_bm = set() current_bm = set()
rmap = {v:k for k, v in self.object_map.iteritems()} rmap = {v:k for k, v in self.object_map.iteritems()}
for p in descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'): for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
if p.tag.endswith('}p'): if p.tag.endswith('}p'):
if current_bm and p in rmap: if current_bm and p in rmap:
para = rmap[p] para = rmap[p]
@ -377,7 +376,7 @@ class Convert(object):
self.anchor_map[name] = para.get('id') self.anchor_map[name] = para.get('id')
current_bm = set() current_bm = set()
elif p in doc_anchors: elif p in doc_anchors:
anchor = get(p, 'w:name') anchor = self.namespace.get(p, 'w:name')
if anchor: if anchor:
current_bm.add(anchor) current_bm.add(anchor)
@ -390,7 +389,7 @@ class Convert(object):
current_anchor = None current_anchor = None
current_hyperlink = None current_hyperlink = None
hl_xpath = XPath('ancestor::w:hyperlink[1]') hl_xpath = self.namespace.XPath('ancestor::w:hyperlink[1]')
def p_parent(x): def p_parent(x):
# Ensure that nested <w:p> tags are handled. These can occur if a # Ensure that nested <w:p> tags are handled. These can occur if a
@ -403,7 +402,7 @@ class Convert(object):
except AttributeError: except AttributeError:
break break
for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'): for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
if p_parent(x) is not p: if p_parent(x) is not p:
continue continue
if x.tag.endswith('}r'): if x.tag.endswith('}r'):
@ -422,7 +421,7 @@ class Convert(object):
dest.append(span) dest.append(span)
self.layers[p].append(x) self.layers[p].append(x)
elif x.tag.endswith('}bookmarkStart'): elif x.tag.endswith('}bookmarkStart'):
anchor = get(x, 'w:name') anchor = self.namespace.get(x, 'w:name')
if anchor and anchor not in self.anchor_map: if anchor and anchor not in self.anchor_map:
old_anchor = current_anchor old_anchor = current_anchor
self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.itervalues())) self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.itervalues()))
@ -502,17 +501,17 @@ class Convert(object):
span = self.wrap_elems(spans, SPAN()) span = self.wrap_elems(spans, SPAN())
span.tag = 'a' span.tag = 'a'
self.resolved_link_map[hyperlink] = span self.resolved_link_map[hyperlink] = span
tgt = get(hyperlink, 'w:tgtFrame') tgt = self.namespace.get(hyperlink, 'w:tgtFrame')
if tgt: if tgt:
span.set('target', tgt) span.set('target', tgt)
tt = get(hyperlink, 'w:tooltip') tt = self.namespace.get(hyperlink, 'w:tooltip')
if tt: if tt:
span.set('title', tt) span.set('title', tt)
rid = get(hyperlink, 'r:id') rid = self.namespace.get(hyperlink, 'r:id')
if rid and rid in relationships_by_id: if rid and rid in relationships_by_id:
span.set('href', relationships_by_id[rid]) span.set('href', relationships_by_id[rid])
continue continue
anchor = get(hyperlink, 'w:anchor') anchor = self.namespace.get(hyperlink, 'w:anchor')
if anchor and anchor in self.anchor_map: if anchor and anchor in self.anchor_map:
span.set('href', '#' + self.anchor_map[anchor]) span.set('href', '#' + self.anchor_map[anchor])
continue continue
@ -576,7 +575,7 @@ class Convert(object):
text = Text(ans, 'text', []) text = Text(ans, 'text', [])
for child in run: for child in run:
if is_tag(child, 'w:t'): if self.namespace.is_tag(child, 'w:t'):
if not child.text: if not child.text:
continue continue
space = child.get(XML('space'), None) space = child.get(XML('space'), None)
@ -596,11 +595,11 @@ class Convert(object):
ans.append(text.elem) ans.append(text.elem)
else: else:
text.buf.append(ctext) text.buf.append(ctext)
elif is_tag(child, 'w:cr'): elif self.namespace.is_tag(child, 'w:cr'):
text.add_elem(BR()) text.add_elem(BR())
ans.append(text.elem) ans.append(text.elem)
elif is_tag(child, 'w:br'): elif self.namespace.is_tag(child, 'w:br'):
typ = get(child, 'w:type') typ = self.namespace.get(child, 'w:type')
if typ in {'column', 'page'}: if typ in {'column', 'page'}:
br = BR(style='page-break-after:always') br = BR(style='page-break-after:always')
else: else:
@ -611,25 +610,25 @@ class Convert(object):
br = BR() br = BR()
text.add_elem(br) text.add_elem(br)
ans.append(text.elem) ans.append(text.elem)
elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'): elif self.namespace.is_tag(child, 'w:drawing') or self.namespace.is_tag(child, 'w:pict'):
for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir): for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
text.add_elem(img) text.add_elem(img)
ans.append(text.elem) ans.append(text.elem)
elif is_tag(child, 'w:footnoteReference') or is_tag(child, 'w:endnoteReference'): elif self.namespace.is_tag(child, 'w:footnoteReference') or self.namespace.is_tag(child, 'w:endnoteReference'):
anchor, name = self.footnotes.get_ref(child) anchor, name = self.footnotes.get_ref(child)
if anchor and name: if anchor and name:
l = SUP(A(name, href='#' + anchor, title=name), id='back_%s' % anchor) l = SUP(A(name, href='#' + anchor, title=name), id='back_%s' % anchor)
l.set('class', 'noteref') l.set('class', 'noteref')
text.add_elem(l) text.add_elem(l)
ans.append(text.elem) ans.append(text.elem)
elif is_tag(child, 'w:tab'): elif self.namespace.is_tag(child, 'w:tab'):
spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6)) spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
text.add_elem(SPAN(NBSP * spaces)) text.add_elem(SPAN(NBSP * spaces))
ans.append(text.elem) ans.append(text.elem)
ans[-1].set('class', 'tab') ans[-1].set('class', 'tab')
elif is_tag(child, 'w:noBreakHyphen'): elif self.namespace.is_tag(child, 'w:noBreakHyphen'):
text.buf.append(u'\u2011') text.buf.append(u'\u2011')
elif is_tag(child, 'w:softHyphen'): elif self.namespace.is_tag(child, 'w:softHyphen'):
text.buf.append(u'\u00ad') text.buf.append(u'\u00ad')
if text.buf: if text.buf:
setattr(text.elem, text.attr, ''.join(text.buf)) setattr(text.elem, text.attr, ''.join(text.buf))

View File

@ -10,7 +10,6 @@ from collections import namedtuple
from lxml.etree import tostring from lxml.etree import tostring
from calibre.ebooks.docx.names import XPath, descendants, get, ancestor
from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.oeb.polish.toc import elem_to_toc_text from calibre.ebooks.oeb.polish.toc import elem_to_toc_text
@ -21,8 +20,9 @@ class Count(object):
def __init__(self): def __init__(self):
self.val = 0 self.val = 0
def from_headings(body, log): def from_headings(body, log, namespace):
' Create a TOC from headings in the document ' ' Create a TOC from headings in the document '
XPath, descendants = namespace.XPath, namespace.descendants
headings = ('h1', 'h2', 'h3') headings = ('h1', 'h2', 'h3')
tocroot = TOC() tocroot = TOC()
xpaths = [XPath('//%s' % x) for x in headings] xpaths = [XPath('//%s' % x) for x in headings]
@ -99,7 +99,8 @@ def link_to_txt(a, styles, object_map):
return tostring(a, method='text', with_tail=False, encoding=unicode).strip() return tostring(a, method='text', with_tail=False, encoding=unicode).strip()
def from_toc(docx, link_map, styles, object_map, log): def from_toc(docx, link_map, styles, object_map, log, namespace):
XPath, get, ancestor = namespace.XPath, namespace.get, namespace.ancestor
toc_level = None toc_level = None
level = 0 level = 0
TI = namedtuple('TI', 'text anchor indent') TI = namedtuple('TI', 'text anchor indent')
@ -136,7 +137,5 @@ def from_toc(docx, link_map, styles, object_map, log):
log('Found Word Table of Contents, using it to generate the Table of Contents') log('Found Word Table of Contents, using it to generate the Table of Contents')
return structure_toc(toc) return structure_toc(toc)
def create_toc(docx, body, link_map, styles, object_map, log, namespace):
    """Return the result of ``from_toc``, or of ``from_headings`` as fallback.

    ``from_toc`` is tried first; ``from_headings`` is only called when the
    former returns a falsy value. *namespace* is forwarded to both.
    """
    toc = from_toc(docx, link_map, styles, object_map, log, namespace)
    return toc or from_headings(body, log, namespace)

View File

@ -13,7 +13,7 @@ from lxml.builder import ElementMaker
from calibre import guess_type from calibre import guess_type
from calibre.constants import numeric_version, __appname__ from calibre.constants import numeric_version, __appname__
from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS, IMAGES, FONTS from calibre.ebooks.docx.names import DOCXNamespace
from calibre.ebooks.metadata import authors_to_string from calibre.ebooks.metadata import authors_to_string
from calibre.utils.date import utcnow from calibre.utils.date import utcnow
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
@ -27,7 +27,8 @@ def xml2str(root, pretty_print=False, with_tail=False):
pretty_print=pretty_print, with_tail=with_tail) pretty_print=pretty_print, with_tail=with_tail)
return ans return ans
def create_skeleton(opts): def create_skeleton(opts, namespaces=None):
namespaces = namespaces or DOCXNamespace().namespaces
def w(x): def w(x):
return '{%s}%s' % (namespaces['w'], x) return '{%s}%s' % (namespaces['w'], x)
dn = {k:v for k, v in namespaces.iteritems() if k in {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne', 'a', 'pic'}} dn = {k:v for k, v in namespaces.iteritems() if k in {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne', 'a', 'pic'}}
@ -70,9 +71,9 @@ def create_skeleton(opts):
return doc, styles, body return doc, styles, body
def update_doc_props(root, mi): def update_doc_props(root, mi, namespace):
def setm(name, text=None, ns='dc'): def setm(name, text=None, ns='dc'):
ans = root.makeelement('{%s}%s' % (namespaces[ns], name)) ans = root.makeelement('{%s}%s' % (namespace.namespaces[ns], name))
for child in tuple(root): for child in tuple(root):
if child.tag == ans.tag: if child.tag == ans.tag:
root.remove(child) root.remove(child)
@ -92,12 +93,13 @@ def update_doc_props(root, mi):
class DocumentRelationships(object): class DocumentRelationships(object):
def __init__(self): def __init__(self, namespace):
self.rmap = {} self.rmap = {}
self.namespace = namespace
for typ, target in { for typ, target in {
STYLES: 'styles.xml', namespace.names['STYLES']: 'styles.xml',
WEB_SETTINGS: 'webSettings.xml', namespace.names['WEB_SETTINGS']: 'webSettings.xml',
FONTS: 'fontTable.xml', namespace.names['FONTS']: 'fontTable.xml',
}.iteritems(): }.iteritems():
self.add_relationship(target, typ) self.add_relationship(target, typ)
@ -112,9 +114,10 @@ class DocumentRelationships(object):
return ans return ans
def add_image(self, target): def add_image(self, target):
return self.add_relationship(target, IMAGES) return self.add_relationship(target, self.namespace.names['IMAGES'])
def serialize(self): def serialize(self):
namespaces = self.namespace.namespaces
E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']}) E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
relationships = E.Relationships() relationships = E.Relationships()
for (target, rtype, target_mode), rid in self.rmap.iteritems(): for (target, rtype, target_mode), rid in self.rmap.iteritems():
@ -127,8 +130,10 @@ class DocumentRelationships(object):
class DOCX(object): class DOCX(object):
def __init__(self, opts, log): def __init__(self, opts, log):
self.namespace = DOCXNamespace()
namespaces = self.namespace.namespaces
self.opts, self.log = opts, log self.opts, self.log = opts, log
self.document_relationships = DocumentRelationships() self.document_relationships = DocumentRelationships(self.namespace)
self.font_table = etree.Element('{%s}fonts' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'}) self.font_table = etree.Element('{%s}fonts' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']}) E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
self.embedded_fonts = E.Relationships() self.embedded_fonts = E.Relationships()
@ -138,7 +143,7 @@ class DOCX(object):
# Boilerplate {{{ # Boilerplate {{{
@property @property
def contenttypes(self): def contenttypes(self):
E = ElementMaker(namespace=namespaces['ct'], nsmap={None:namespaces['ct']}) E = ElementMaker(namespace=self.namespace.namespaces['ct'], nsmap={None:self.namespace.namespaces['ct']})
types = E.Types() types = E.Types()
for partname, mt in { for partname, mt in {
"/word/footnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml", "/word/footnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
@ -174,7 +179,7 @@ class DOCX(object):
@property @property
def appproperties(self): def appproperties(self):
E = ElementMaker(namespace=namespaces['ep'], nsmap={None:namespaces['ep']}) E = ElementMaker(namespace=self.namespace.namespaces['ep'], nsmap={None:self.namespace.namespaces['ep']})
props = E.Properties( props = E.Properties(
E.Application(__appname__), E.Application(__appname__),
E.AppVersion('%02d.%04d' % numeric_version[:2]), E.AppVersion('%02d.%04d' % numeric_version[:2]),
@ -193,14 +198,14 @@ class DOCX(object):
return textwrap.dedent(b'''\ return textwrap.dedent(b'''\
<?xml version='1.0' encoding='utf-8'?> <?xml version='1.0' encoding='utf-8'?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"> <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties" Target="docProps/app.xml"/> <Relationship Id="rId3" Type="{APPPROPS}" Target="docProps/app.xml"/>
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/> <Relationship Id="rId2" Type="{DOCPROPS}" Target="docProps/core.xml"/>
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/> <Relationship Id="rId1" Type="{DOCUMENT}" Target="word/document.xml"/>
</Relationships>''') </Relationships>'''.format(**self.namespace.names))
@property @property
def websettings(self): def websettings(self):
E = ElementMaker(namespace=namespaces['w'], nsmap={'w':namespaces['w']}) E = ElementMaker(namespace=self.namespace.namespaces['w'], nsmap={'w':self.namespace.namespaces['w']})
ws = E.webSettings( ws = E.webSettings(
E.optimizeForBrowser, E.allowPNG, E.doNotSaveAsSingleFile) E.optimizeForBrowser, E.allowPNG, E.doNotSaveAsSingleFile)
return xml2str(ws) return xml2str(ws)
@ -208,6 +213,7 @@ class DOCX(object):
# }}} # }}}
def convert_metadata(self, mi): def convert_metadata(self, mi):
namespaces = self.namespace.namespaces
E = ElementMaker(namespace=namespaces['cp'], nsmap={x:namespaces[x] for x in 'cp dc dcterms xsi'.split()}) E = ElementMaker(namespace=namespaces['cp'], nsmap={x:namespaces[x] for x in 'cp dc dcterms xsi'.split()})
cp = E.coreProperties(E.revision("1"), E.lastModifiedBy('calibre')) cp = E.coreProperties(E.revision("1"), E.lastModifiedBy('calibre'))
ts = utcnow().isoformat(str('T')).rpartition('.')[0] + 'Z' ts = utcnow().isoformat(str('T')).rpartition('.')[0] + 'Z'
@ -216,7 +222,7 @@ class DOCX(object):
x.text = ts x.text = ts
cp.append(x) cp.append(x)
self.mi = mi self.mi = mi
update_doc_props(cp, self.mi) update_doc_props(cp, self.mi, self.namespace)
return xml2str(cp) return xml2str(cp)
def create_empty_document(self, mi): def create_empty_document(self, mi):

View File

@ -9,7 +9,6 @@ __copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import defaultdict from collections import defaultdict
from uuid import uuid4 from uuid import uuid4
from calibre.ebooks.docx.names import makeelement, EMBEDDED_FONT
from calibre.ebooks.oeb.base import OEB_STYLES from calibre.ebooks.oeb.base import OEB_STYLES
from calibre.ebooks.oeb.transforms.subset import find_font_face_rules from calibre.ebooks.oeb.transforms.subset import find_font_face_rules
@ -21,10 +20,12 @@ def obfuscate_font_data(data, key):
class FontsManager(object): class FontsManager(object):
def __init__(self, oeb, opts): def __init__(self, namespace, oeb, opts):
self.namespace = namespace
self.oeb, self.log, self.opts = oeb, oeb.log, opts self.oeb, self.log, self.opts = oeb, oeb.log, opts
def serialize(self, text_styles, fonts, embed_relationships, font_data_map): def serialize(self, text_styles, fonts, embed_relationships, font_data_map):
makeelement = self.namespace.makeelement
font_families, seen = set(), set() font_families, seen = set(), set()
for ts in text_styles: for ts in text_styles:
if ts.font_family: if ts.font_family:
@ -68,7 +69,7 @@ class FontsManager(object):
if rid is None: if rid is None:
rel_map[item] = rid = 'rId%d' % num rel_map[item] = rid = 'rId%d' % num
fname = 'fonts/font%d.odttf' % num fname = 'fonts/font%d.odttf' % num
makeelement(embed_relationships, 'Relationship', Id=rid, Type=EMBEDDED_FONT, Target=fname) makeelement(embed_relationships, 'Relationship', Id=rid, Type=self.namespace.names['EMBEDDED_FONT'], Target=fname)
font_data_map['word/' + fname] = obfuscate_font_data(item.data, key) font_data_map['word/' + fname] = obfuscate_font_data(item.data, key)
makeelement(font, 'w:embed' + tag, r_id=rid, makeelement(font, 'w:embed' + tag, r_id=rid,
w_fontKey='{%s}' % key.urn.rpartition(':')[-1].upper(), w_fontKey='{%s}' % key.urn.rpartition(':')[-1].upper(),

View File

@ -9,7 +9,7 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re import re
from calibre.ebooks.docx.writer.container import create_skeleton from calibre.ebooks.docx.writer.container import create_skeleton
from calibre.ebooks.docx.writer.styles import w, StylesManager from calibre.ebooks.docx.writer.styles import StylesManager
from calibre.ebooks.docx.writer.images import ImagesManager from calibre.ebooks.docx.writer.images import ImagesManager
from calibre.ebooks.docx.writer.fonts import FontsManager from calibre.ebooks.docx.writer.fonts import FontsManager
from calibre.ebooks.docx.writer.tables import Table from calibre.ebooks.docx.writer.tables import Table
@ -45,12 +45,13 @@ class TextRun(object):
ws_pat = None ws_pat = None
def __init__(self, style, first_html_parent): def __init__(self, namespace, style, first_html_parent):
self.first_html_parent = first_html_parent self.first_html_parent = first_html_parent
if self.ws_pat is None: if self.ws_pat is None:
TextRun.ws_pat = self.ws_pat = re.compile(r'\s+') TextRun.ws_pat = self.ws_pat = re.compile(r'\s+')
self.style = style self.style = style
self.texts = [] self.texts = []
self.makelement = namespace.makeelement
def add_text(self, text, preserve_whitespace): def add_text(self, text, preserve_whitespace):
if not preserve_whitespace: if not preserve_whitespace:
@ -68,19 +69,18 @@ class TextRun(object):
self.texts.append((drawing, None)) self.texts.append((drawing, None))
def serialize(self, p): def serialize(self, p):
r = p.makeelement(w('r')) makeelement = self.makelement
p.append(r) r = makeelement(p, 'w:r')
rpr = r.makeelement(w('rPr')) rpr = makeelement(r, 'w:rPr')
rpr.append(rpr.makeelement(w('rStyle'), **{w('val'):self.style.id})) makeelement(rpr, 'w:rStyle', w_val=self.style.id)
r.append(rpr)
for text, preserve_whitespace in self.texts: for text, preserve_whitespace in self.texts:
if text is None: if text is None:
r.append(r.makeelement(w('br'), **{w('clear'):preserve_whitespace})) makeelement(r, 'w:br', w_clear=preserve_whitespace)
elif hasattr(text, 'xpath'): elif hasattr(text, 'xpath'):
r.append(text) r.append(text)
else: else:
t = r.makeelement(w('t')) t = makeelement(r, 'w:t')
r.append(t)
t.text = text or '' t.text = text or ''
if preserve_whitespace: if preserve_whitespace:
t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve') t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
@ -94,7 +94,8 @@ class TextRun(object):
class Block(object): class Block(object):
def __init__(self, styles_manager, html_block, style, is_table_cell=False): def __init__(self, namespace, styles_manager, html_block, style, is_table_cell=False):
self.namespace = namespace
self.html_block = html_block self.html_block = html_block
self.html_style = style self.html_style = style
self.style = styles_manager.create_block_style(style, html_block, is_table_cell=is_table_cell) self.style = styles_manager.create_block_style(style, html_block, is_table_cell=is_table_cell)
@ -109,7 +110,7 @@ class Block(object):
if self.runs and ts == self.runs[-1].style: if self.runs and ts == self.runs[-1].style:
run = self.runs[-1] run = self.runs[-1]
else: else:
run = TextRun(ts, self.html_block if html_parent is None else html_parent) run = TextRun(self.namespace, ts, self.html_block if html_parent is None else html_parent)
self.runs.append(run) self.runs.append(run)
preserve_whitespace = ws in {'pre', 'pre-wrap'} preserve_whitespace = ws in {'pre', 'pre-wrap'}
if ignore_leading_whitespace and not preserve_whitespace: if ignore_leading_whitespace and not preserve_whitespace:
@ -125,7 +126,7 @@ class Block(object):
if self.runs: if self.runs:
run = self.runs[-1] run = self.runs[-1]
else: else:
run = TextRun(self.styles_manager.create_text_style(self.html_style), self.html_block) run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block)
self.runs.append(run) self.runs.append(run)
run.add_break(clear=clear) run.add_break(clear=clear)
@ -133,20 +134,19 @@ class Block(object):
if self.runs: if self.runs:
run = self.runs[-1] run = self.runs[-1]
else: else:
run = TextRun(self.styles_manager.create_text_style(self.html_style), self.html_block) run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block)
self.runs.append(run) self.runs.append(run)
run.add_image(drawing) run.add_image(drawing)
def serialize(self, body): def serialize(self, body):
p = body.makeelement(w('p')) makeelement = self.namespace.makeelement
body.append(p) p = makeelement(body, 'w:p')
ppr = p.makeelement(w('pPr')) ppr = makeelement(p, 'w:pPr')
p.append(ppr)
if self.keep_next: if self.keep_next:
ppr.append(ppr.makeelement(w('keepNext'))) makeelement(ppr, 'w:keepNext')
if self.page_break_before: if self.page_break_before:
ppr.append(ppr.makeelement(w('pageBreakBefore'))) makeelement(ppr, 'w:pageBreakBefore')
ppr.append(ppr.makeelement(w('pStyle'), **{w('val'):self.style.id})) makeelement(ppr, 'w:pStyle', w_val=self.style.id)
for run in self.runs: for run in self.runs:
run.serialize(p) run.serialize(p)
@ -158,7 +158,8 @@ class Block(object):
class Blocks(object): class Blocks(object):
def __init__(self, styles_manager): def __init__(self, namespace, styles_manager):
self.namespace = namespace
self.styles_manager = styles_manager self.styles_manager = styles_manager
self.all_blocks = [] self.all_blocks = []
self.pos = 0 self.pos = 0
@ -183,12 +184,12 @@ class Blocks(object):
def start_new_block(self, html_block, style, is_table_cell=False): def start_new_block(self, html_block, style, is_table_cell=False):
self.end_current_block() self.end_current_block()
self.current_block = Block(self.styles_manager, html_block, style, is_table_cell=is_table_cell) self.current_block = Block(self.namespace, self.styles_manager, html_block, style, is_table_cell=is_table_cell)
self.open_html_blocks.add(html_block) self.open_html_blocks.add(html_block)
return self.current_block return self.current_block
def start_new_table(self, html_tag, tag_style=None): def start_new_table(self, html_tag, tag_style=None):
self.current_table = Table(html_tag, tag_style) self.current_table = Table(self.namespace, html_tag, tag_style)
self.tables.append(self.current_table) self.tables.append(self.current_table)
def start_new_row(self, html_tag, tag_style): def start_new_row(self, html_tag, tag_style):
@ -252,10 +253,10 @@ class Convert(object):
self.svg_rasterizer = SVGRasterizer() self.svg_rasterizer = SVGRasterizer()
self.svg_rasterizer(self.oeb, self.opts) self.svg_rasterizer(self.oeb, self.opts)
self.styles_manager = StylesManager() self.styles_manager = StylesManager(self.docx.namespace)
self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships) self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
self.fonts_manager = FontsManager(self.oeb, self.opts) self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts)
self.blocks = Blocks(self.styles_manager) self.blocks = Blocks(self.docx.namespace, self.styles_manager)
for item in self.oeb.spine: for item in self.oeb.spine:
self.process_item(item) self.process_item(item)

View File

@ -15,7 +15,6 @@ from future_builtins import map
from lxml import etree from lxml import etree
from calibre.ebooks.oeb.base import urlunquote from calibre.ebooks.oeb.base import urlunquote
from calibre.ebooks.docx.names import makeelement, namespaces
from calibre.ebooks.docx.images import pt_to_emu from calibre.ebooks.docx.images import pt_to_emu
from calibre.utils.filenames import ascii_filename from calibre.utils.filenames import ascii_filename
from calibre.utils.magick.draw import identify_data from calibre.utils.magick.draw import identify_data
@ -68,6 +67,8 @@ class ImagesManager(object):
name = urlunquote(posixpath.basename(href)) name = urlunquote(posixpath.basename(href))
width, height = map(pt_to_emu, style.img_size(img.width, img.height)) width, height = map(pt_to_emu, style.img_size(img.width, img.height))
makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
root = etree.Element('root', nsmap=namespaces) root = etree.Element('root', nsmap=namespaces)
ans = makeelement(root, 'w:drawing', append=False) ans = makeelement(root, 'w:drawing', append=False)
if floating is None: if floating is None:

View File

@ -12,7 +12,6 @@ from operator import attrgetter
from lxml import etree from lxml import etree
from calibre.ebooks import parse_css_length from calibre.ebooks import parse_css_length
from calibre.ebooks.docx.names import namespaces
from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero
from calibre.utils.icu import numeric_sort_key from calibre.utils.icu import numeric_sort_key
from tinycss.css21 import CSS21Parser from tinycss.css21 import CSS21Parser
@ -38,12 +37,6 @@ def css_font_family_to_docx(raw):
for ff in parse_css_font_family(raw): for ff in parse_css_font_family(raw):
return generic.get(ff.lower(), ff) return generic.get(ff.lower(), ff)
def w(x):
return '{%s}%s' % (namespaces['w'], x)
def makeelement(parent, name, **attrs):
return parent.makeelement(w(name), **{w(k):v for k, v in attrs.iteritems()})
def bmap(x): def bmap(x):
return 'on' if x else 'off' return 'on' if x else 'off'
@ -52,12 +45,17 @@ class DOCXStyle(object):
ALL_PROPS = () ALL_PROPS = ()
TYPE = 'paragraph' TYPE = 'paragraph'
def __init__(self): def __init__(self, namespace):
self.namespace = namespace
self.w = lambda x: '{%s}%s' % (namespace.namespaces['w'], x)
self._hash = hash(tuple( self._hash = hash(tuple(
getattr(self, x) for x in self.ALL_PROPS)) getattr(self, x) for x in self.ALL_PROPS))
self.id = self.name = None self.id = self.name = None
self.next_style = None self.next_style = None
def makeelement(self, parent, name, **attrs):
return parent.makeelement(self.w(name), **{self.w(k):v for k, v in attrs.iteritems()})
def __hash__(self): def __hash__(self):
return self._hash return self._hash
@ -71,10 +69,11 @@ class DOCXStyle(object):
return not self == other return not self == other
def __repr__(self): def __repr__(self):
return etree.tostring(self.serialize(etree.Element(self.__class__.__name__, nsmap={'w':namespaces['w']})), pretty_print=True) return etree.tostring(self.serialize(etree.Element(self.__class__.__name__, nsmap={'w':self.namespace.namespaces['w']})), pretty_print=True)
__str__ = __repr__ __str__ = __repr__
def serialize(self, styles, normal_style): def serialize(self, styles, normal_style):
w, makeelement = self.w, self.makeelement
style = makeelement(styles, 'style', styleId=self.id, type=self.TYPE) style = makeelement(styles, 'style', styleId=self.id, type=self.TYPE)
style.append(makeelement(style, 'name', val=self.name)) style.append(makeelement(style, 'name', val=self.name))
if self is normal_style: if self is normal_style:
@ -106,7 +105,7 @@ class TextStyle(DOCXStyle):
'border_style', 'border_width', 'border_color') 'border_style', 'border_width', 'border_color')
TYPE = 'character' TYPE = 'character'
def __init__(self, css, is_parent_style=False): def __init__(self, namespace, css, is_parent_style=False):
self.font_family = css_font_family_to_docx(css['font-family']) self.font_family = css_font_family_to_docx(css['font-family'])
try: try:
self.font_size = max(0, int(float(css['font-size']) * 2)) # stylizer normalizes all font sizes into pts self.font_size = max(0, int(float(css['font-size']) * 2)) # stylizer normalizes all font sizes into pts
@ -163,9 +162,10 @@ class TextStyle(DOCXStyle):
elif self.border_style != style: elif self.border_style != style:
self.border_style = ignore self.border_style = ignore
DOCXStyle.__init__(self) DOCXStyle.__init__(self, namespace)
def serialize_borders(self, bdr, normal_style): def serialize_borders(self, bdr, normal_style):
w = self.w
if (self.padding not in (None, ignore, 0) and self is normal_style) or self.padding != normal_style.padding: if (self.padding not in (None, ignore, 0) and self is normal_style) or self.padding != normal_style.padding:
bdr.set(w('space'), str(0 if self.padding in (None, ignore) else self.padding)) bdr.set(w('space'), str(0 if self.padding in (None, ignore) else self.padding))
if (self.border_width not in (None, ignore, 0) and self is normal_style) or self.border_width != normal_style.border_width: if (self.border_width not in (None, ignore, 0) and self is normal_style) or self.border_width != normal_style.border_width:
@ -177,6 +177,7 @@ class TextStyle(DOCXStyle):
return bdr return bdr
def serialize(self, styles, normal_style): def serialize(self, styles, normal_style):
makeelement = self.makeelement
style_root = DOCXStyle.serialize(self, styles, normal_style) style_root = DOCXStyle.serialize(self, styles, normal_style)
style = makeelement(style_root, 'rPr') style = makeelement(style_root, 'rPr')
@ -273,7 +274,7 @@ class BlockStyle(DOCXStyle):
[x%edge for edge in border_edges for x in border_props] [x%edge for edge in border_edges for x in border_props]
) )
def __init__(self, css, html_block, is_table_cell=False): def __init__(self, namespace, css, html_block, is_table_cell=False):
read_css_block_borders(self, css) read_css_block_borders(self, css)
if is_table_cell: if is_table_cell:
for edge in border_edges: for edge in border_edges:
@ -298,9 +299,10 @@ class BlockStyle(DOCXStyle):
self.text_align = {'start':'left', 'left':'left', 'end':'right', 'right':'right', 'center':'center', 'justify':'both', 'centre':'center'}.get( self.text_align = {'start':'left', 'left':'left', 'end':'right', 'right':'right', 'center':'center', 'justify':'both', 'centre':'center'}.get(
css['text-align'].lower(), 'left') css['text-align'].lower(), 'left')
DOCXStyle.__init__(self) DOCXStyle.__init__(self, namespace)
def serialize_borders(self, bdr, normal_style): def serialize_borders(self, bdr, normal_style):
w = self.w
for edge in border_edges: for edge in border_edges:
e = bdr.makeelement(w(edge)) e = bdr.makeelement(w(edge))
padding = getattr(self, 'padding_' + edge) padding = getattr(self, 'padding_' + edge)
@ -319,6 +321,7 @@ class BlockStyle(DOCXStyle):
return bdr return bdr
def serialize(self, styles, normal_style): def serialize(self, styles, normal_style):
w, makeelement = self.w, self.makeelement
style_root = DOCXStyle.serialize(self, styles, normal_style) style_root = DOCXStyle.serialize(self, styles, normal_style)
style = makeelement(style_root, 'pPr') style = makeelement(style_root, 'pPr')
@ -393,11 +396,12 @@ class BlockStyle(DOCXStyle):
class StylesManager(object): class StylesManager(object):
def __init__(self): def __init__(self, namespace):
self.namespace = namespace
self.block_styles, self.text_styles = {}, {} self.block_styles, self.text_styles = {}, {}
def create_text_style(self, css_style, is_parent_style=False): def create_text_style(self, css_style, is_parent_style=False):
ans = TextStyle(css_style, is_parent_style=is_parent_style) ans = TextStyle(self.namespace, css_style, is_parent_style=is_parent_style)
existing = self.text_styles.get(ans, None) existing = self.text_styles.get(ans, None)
if existing is None: if existing is None:
self.text_styles[ans] = ans self.text_styles[ans] = ans
@ -406,7 +410,7 @@ class StylesManager(object):
return ans return ans
def create_block_style(self, css_style, html_block, is_table_cell=False): def create_block_style(self, css_style, html_block, is_table_cell=False):
ans = BlockStyle(css_style, html_block, is_table_cell=is_table_cell) ans = BlockStyle(self.namespace, css_style, html_block, is_table_cell=is_table_cell)
existing = self.block_styles.get(ans, None) existing = self.block_styles.get(ans, None)
if existing is None: if existing is None:
self.block_styles[ans] = ans self.block_styles[ans] = ans

View File

@ -8,7 +8,6 @@ __copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import namedtuple from collections import namedtuple
from calibre.ebooks.docx.names import makeelement
from calibre.ebooks.docx.writer.utils import convert_color from calibre.ebooks.docx.writer.utils import convert_color
from calibre.ebooks.docx.writer.styles import read_css_block_borders as rcbb, border_edges from calibre.ebooks.docx.writer.styles import read_css_block_borders as rcbb, border_edges
@ -29,7 +28,7 @@ class SpannedCell(object):
def resolve_borders(self): def resolve_borders(self):
pass pass
def serialize(self, tr): def serialize(self, tr, makeelement):
tc = makeelement(tr, 'w:tc') tc = makeelement(tr, 'w:tc')
tcPr = makeelement(tc, 'w:tcPr') tcPr = makeelement(tc, 'w:tcPr')
makeelement(tcPr, 'w:%sMerge' % ('h' if self.horizontal else 'v'), w_val='continue') makeelement(tcPr, 'w:%sMerge' % ('h' if self.horizontal else 'v'), w_val='continue')
@ -70,14 +69,6 @@ def convert_width(tag_style):
pass pass
return ('auto', 0) return ('auto', 0)
def serialize_border_edge(self, bdr, edge):
width = getattr(self, 'border_%s_width' % edge)
bstyle = getattr(self, 'border_%s_style' % edge)
if width > 0 and bstyle != 'none':
makeelement(bdr, 'w:' + edge, w_val=bstyle, w_sz=str(width), w_color=getattr(self, 'border_%s_color' % edge))
return True
return False
class Cell(object): class Cell(object):
BLEVEL = 2 BLEVEL = 2
@ -107,7 +98,7 @@ class Cell(object):
self.items.append(table) self.items.append(table)
return table return table
def serialize(self, parent): def serialize(self, parent, makeelement):
tc = makeelement(parent, 'w:tc') tc = makeelement(parent, 'w:tc')
tcPr = makeelement(tc, 'w:tcPr') tcPr = makeelement(tc, 'w:tcPr')
makeelement(tcPr, 'w:tcW', w_type=self.width[0], w_w=str(self.width[1])) makeelement(tcPr, 'w:tcW', w_type=self.width[0], w_w=str(self.width[1]))
@ -240,16 +231,17 @@ class Row(object):
def add_table(self, table): def add_table(self, table):
return self.current_cell.add_table(table) return self.current_cell.add_table(table)
def serialize(self, parent): def serialize(self, parent, makeelement):
tr = makeelement(parent, 'w:tr') tr = makeelement(parent, 'w:tr')
for cell in self.cells: for cell in self.cells:
cell.serialize(tr) cell.serialize(tr, makeelement)
class Table(object): class Table(object):
BLEVEL = 0 BLEVEL = 0
def __init__(self, html_tag, tag_style=None): def __init__(self, namespace, html_tag, tag_style=None):
self.namespace = namespace
self.html_tag = html_tag self.html_tag = html_tag
self.rows = [] self.rows = []
self.current_row = None self.current_row = None
@ -329,6 +321,7 @@ class Table(object):
return self.current_row.add_table(table) return self.current_row.add_table(table)
def serialize(self, parent): def serialize(self, parent):
makeelement = self.namespace.makeelement
rows = [r for r in self.rows if r.cells] rows = [r for r in self.rows if r.cells]
if not rows: if not rows:
return return
@ -338,4 +331,4 @@ class Table(object):
if self.jc is not None: if self.jc is not None:
makeelement(tblPr, 'w:jc', w_val=self.jc) makeelement(tblPr, 'w:jc', w_val=self.jc)
for row in rows: for row in rows:
row.serialize(tbl) row.serialize(tbl, makeelement)

View File

@ -12,14 +12,14 @@ from io import BytesIO
from lxml import etree from lxml import etree
from calibre.ebooks.docx.container import DOCX from calibre.ebooks.docx.container import DOCX
from calibre.ebooks.docx.writer.container import update_doc_props, xml2str, namespaces from calibre.ebooks.docx.writer.container import update_doc_props, xml2str
from calibre.ebooks.docx.names import XPath, get
from calibre.utils.magick.draw import identify_data from calibre.utils.magick.draw import identify_data
images = XPath('//*[name()="w:drawing" or name()="w:pict"]/descendant::*[(name()="a:blip" and @r:embed) or (name()="v:imagedata" and @r:id)][1]')
def get_cover(docx): def get_cover(docx):
doc = docx.document doc = docx.document
get = docx.namespace.get
images = docx.namespace.XPath(
'//*[name()="w:drawing" or name()="w:pict"]/descendant::*[(name()="a:blip" and @r:embed) or (name()="v:imagedata" and @r:id)][1]')
rid_map = docx.document_relationships[0] rid_map = docx.document_relationships[0]
for image in images(doc): for image in images(doc):
rid = get(image, 'r:embed') or get(image, 'r:id') rid = get(image, 'r:embed') or get(image, 'r:id')
@ -58,11 +58,11 @@ def set_metadata(stream, mi):
except Exception: except Exception:
ap_raw = None ap_raw = None
cp = etree.fromstring(dp_raw) cp = etree.fromstring(dp_raw)
update_doc_props(cp, mi) update_doc_props(cp, mi, c.namespace)
replacements = {} replacements = {}
if ap_raw is not None: if ap_raw is not None:
ap = etree.fromstring(ap_raw) ap = etree.fromstring(ap_raw)
comp = ap.makeelement('{%s}Company' % namespaces['ep']) comp = ap.makeelement('{%s}Company' % c.namespace.namespaces['ep'])
for child in tuple(ap): for child in tuple(ap):
if child.tag == comp.tag: if child.tag == comp.tag:
ap.remove(child) ap.remove(child)