DOCX: Refactor to support different namespace profiles

This refactoring is needed to support the "Strict" mode DOCX files that Word 2013 can
optionally generate.
This commit is contained in:
Kovid Goyal 2015-04-10 13:39:24 +05:30
parent 341f011372
commit b9f86450a0
24 changed files with 441 additions and 413 deletions

View File

@ -7,13 +7,12 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict
from calibre.ebooks.docx.names import XPath, get
class Inherit:
pass
inherit = Inherit()
def binary_property(parent, name):
def binary_property(parent, name, XPath, get):
vals = XPath('./w:%s' % name)(parent)
if not vals:
return inherit
@ -68,7 +67,7 @@ LINE_STYLES = { # {{{
border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color')
def read_single_border(parent, edge):
def read_single_border(parent, edge, XPath, get):
color = style = width = padding = None
for elem in XPath('./w:%s' % edge)(parent):
c = get(elem, 'w:color')
@ -95,19 +94,19 @@ def read_single_border(parent, edge):
width = 3 # WebKit needs 3pts to render double borders
return {p:v for p, v in zip(border_props, (padding, width, style, color))}
def read_border(parent, dest, border_edges=('left', 'top', 'right', 'bottom'), name='pBdr'):
def read_border(parent, dest, XPath, get, border_edges=('left', 'top', 'right', 'bottom'), name='pBdr'):
vals = {k % edge:inherit for edge in border_edges for k in border_props}
for border in XPath('./w:' + name)(parent):
for edge in border_edges:
for prop, val in read_single_border(border, edge).iteritems():
for prop, val in read_single_border(border, edge, XPath, get).iteritems():
if val is not None:
vals[prop % edge] = val
for key, val in vals.iteritems():
setattr(dest, key, val)
def read_indent(parent, dest):
def read_indent(parent, dest, XPath, get):
padding_left = padding_right = text_indent = inherit
for indent in XPath('./w:ind')(parent):
l, lc = get(indent, 'w:left'), get(indent, 'w:leftChars')
@ -133,7 +132,7 @@ def read_indent(parent, dest):
setattr(dest, 'margin_right', padding_right)
setattr(dest, 'text_indent', text_indent)
def read_justification(parent, dest):
def read_justification(parent, dest, XPath, get):
ans = inherit
for jc in XPath('./w:jc[@w:val]')(parent):
val = get(jc, 'w:val')
@ -145,7 +144,7 @@ def read_justification(parent, dest):
ans = val
setattr(dest, 'text_align', ans)
def read_spacing(parent, dest):
def read_spacing(parent, dest, XPath, get):
padding_top = padding_bottom = line_height = inherit
for s in XPath('./w:spacing')(parent):
a, al, aa = get(s, 'w:after'), get(s, 'w:afterLines'), get(s, 'w:afterAutospacing')
@ -167,7 +166,7 @@ def read_spacing(parent, dest):
setattr(dest, 'margin_bottom', padding_bottom)
setattr(dest, 'line_height', line_height)
def read_direction(parent, dest):
def read_direction(parent, dest, XPath, get):
ans = inherit
for jc in XPath('./w:textFlow[@w:val]')(parent):
val = get(jc, 'w:val')
@ -177,7 +176,7 @@ def read_direction(parent, dest):
ans = 'rtl'
setattr(dest, 'direction', ans)
def read_shd(parent, dest):
def read_shd(parent, dest, XPath, get):
ans = inherit
for shd in XPath('./w:shd[@w:fill]')(parent):
val = get(shd, 'w:fill')
@ -185,7 +184,7 @@ def read_shd(parent, dest):
ans = simple_color(val, auto='transparent')
setattr(dest, 'background_color', ans)
def read_numbering(parent, dest):
def read_numbering(parent, dest, XPath, get):
lvl = num_id = None
for np in XPath('./w:numPr')(parent):
for ilvl in XPath('./w:ilvl[@w:val]')(np):
@ -203,7 +202,7 @@ class Frame(object):
all_attributes = ('drop_cap', 'h', 'w', 'h_anchor', 'h_rule', 'v_anchor', 'wrap',
'h_space', 'v_space', 'lines', 'x_align', 'y_align', 'x', 'y')
def __init__(self, fp):
def __init__(self, fp, XPath, get):
self.drop_cap = get(fp, 'w:dropCap', 'none')
try:
self.h = int(get(fp, 'w:h'))/20
@ -275,10 +274,10 @@ class Frame(object):
def __ne__(self, other):
return not self.__eq__(other)
def read_frame(parent, dest):
def read_frame(parent, dest, XPath, get):
ans = inherit
for fp in XPath('./w:framePr')(parent):
ans = Frame(fp)
ans = Frame(fp, XPath, get)
setattr(dest, 'frame', ans)
# }}}
@ -303,7 +302,8 @@ class ParagraphStyle(object):
'numbering', 'font_family', 'font_size', 'color', 'frame',
)
def __init__(self, pPr=None):
def __init__(self, namespace, pPr=None):
self.namespace = namespace
self.linked_style = None
if pPr is None:
for p in self.all_properties:
@ -315,14 +315,14 @@ class ParagraphStyle(object):
'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',
):
setattr(self, p, binary_property(pPr, p))
setattr(self, p, binary_property(pPr, p, namespace.XPath, namespace.get))
for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd', 'numbering', 'frame'):
f = globals()['read_%s' % x]
f(pPr, self)
f(pPr, self, namespace.XPath, namespace.get)
for s in XPath('./w:pStyle[@w:val]')(pPr):
self.linked_style = get(s, 'w:val')
for s in namespace.XPath('./w:pStyle[@w:val]')(pPr):
self.linked_style = namespace.get(s, 'w:val')
self.font_family = self.font_size = self.color = inherit

View File

@ -9,10 +9,9 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict
from calibre.ebooks.docx.block_styles import ( # noqa
inherit, simple_color, LINE_STYLES, simple_float, binary_property, read_shd)
from calibre.ebooks.docx.names import XPath, get
# Read from XML {{{
def read_text_border(parent, dest):
def read_text_border(parent, dest, XPath, get):
border_color = border_style = border_width = padding = inherit
elems = XPath('./w:bdr')(parent)
if elems and elems[0].attrib:
@ -46,7 +45,7 @@ def read_text_border(parent, dest):
setattr(dest, 'border_width', border_width)
setattr(dest, 'padding', padding)
def read_color(parent, dest):
def read_color(parent, dest, XPath, get):
ans = inherit
for col in XPath('./w:color[@w:val]')(parent):
val = get(col, 'w:val')
@ -61,7 +60,7 @@ def convert_highlight_color(val):
'darkGreen': '#008000', 'darkMagenta': '#800080', 'darkRed': '#800000', 'darkYellow': '#808000',
'lightGray': '#c0c0c0'}.get(val, val)
def read_highlight(parent, dest):
def read_highlight(parent, dest, XPath, get):
ans = inherit
for col in XPath('./w:highlight[@w:val]')(parent):
val = get(col, 'w:val')
@ -74,7 +73,7 @@ def read_highlight(parent, dest):
ans = val
setattr(dest, 'highlight', ans)
def read_lang(parent, dest):
def read_lang(parent, dest, XPath, get):
ans = inherit
for col in XPath('./w:lang[@w:val]')(parent):
val = get(col, 'w:val')
@ -91,7 +90,7 @@ def read_lang(parent, dest):
ans = val
setattr(dest, 'lang', ans)
def read_letter_spacing(parent, dest):
def read_letter_spacing(parent, dest, XPath, get):
ans = inherit
for col in XPath('./w:spacing[@w:val]')(parent):
val = simple_float(get(col, 'w:val'), 0.05)
@ -99,7 +98,7 @@ def read_letter_spacing(parent, dest):
ans = val
setattr(dest, 'letter_spacing', ans)
def read_sz(parent, dest):
def read_sz(parent, dest, XPath, get):
ans = inherit
for col in XPath('./w:sz[@w:val]')(parent):
val = simple_float(get(col, 'w:val'), 0.5)
@ -107,7 +106,7 @@ def read_sz(parent, dest):
ans = val
setattr(dest, 'font_size', ans)
def read_underline(parent, dest):
def read_underline(parent, dest, XPath, get):
ans = inherit
for col in XPath('./w:u[@w:val]')(parent):
val = get(col, 'w:val')
@ -115,7 +114,7 @@ def read_underline(parent, dest):
ans = val if val == 'none' else 'underline'
setattr(dest, 'text_decoration', ans)
def read_vert_align(parent, dest):
def read_vert_align(parent, dest, XPath, get):
ans = inherit
for col in XPath('./w:vertAlign[@w:val]')(parent):
val = get(col, 'w:val')
@ -123,7 +122,7 @@ def read_vert_align(parent, dest):
ans = val
setattr(dest, 'vert_align', ans)
def read_font_family(parent, dest):
def read_font_family(parent, dest, XPath, get):
ans = inherit
for col in XPath('./w:rFonts')(parent):
val = get(col, 'w:asciiTheme')
@ -150,7 +149,8 @@ class RunStyle(object):
'b', 'bCs', 'caps', 'emboss', 'i', 'iCs', 'imprint', 'shadow', 'smallCaps', 'strike', 'dstrike', 'vanish',
}
def __init__(self, rPr=None):
def __init__(self, namespace, rPr=None):
self.namespace = namespace
self.linked_style = None
if rPr is None:
for p in self.all_properties:
@ -160,14 +160,14 @@ class RunStyle(object):
'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow',
'smallCaps', 'strike', 'vanish', 'webHidden',
):
setattr(self, p, binary_property(rPr, p))
setattr(self, p, binary_property(rPr, p, namespace.XPath, namespace.get))
for x in ('text_border', 'color', 'highlight', 'shd', 'letter_spacing', 'sz', 'underline', 'vert_align', 'lang', 'font_family'):
f = globals()['read_%s' % x]
f(rPr, self)
f(rPr, self, namespace.XPath, namespace.get)
for s in XPath('./w:rStyle[@w:val]')(rPr):
self.linked_style = get(s, 'w:val')
for s in namespace.XPath('./w:rStyle[@w:val]')(rPr):
self.linked_style = namespace.get(s, 'w:val')
self._css = None

View File

@ -8,7 +8,6 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os
from calibre.ebooks.docx.names import XPath
NBSP = '\xa0'
def mergeable(previous, current):
@ -99,7 +98,7 @@ def before_count(root, tag, limit=10):
if ans > limit:
return limit
def cleanup_markup(log, root, styles, dest_dir, detect_cover):
def cleanup_markup(log, root, styles, dest_dir, detect_cover, XPath):
# Move <hr>s outside paragraphs, if possible.
pancestor = XPath('|'.join('ancestor::%s[1]' % x for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')))
for hr in root.xpath('//span/hr'):

View File

@ -14,7 +14,7 @@ from calibre import walk, guess_type
from calibre.ebooks.metadata import string_to_authors, authors_to_sort_string
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.docx import InvalidDOCX
from calibre.ebooks.docx.names import DOCUMENT, DOCPROPS, XPath, APPPROPS
from calibre.ebooks.docx.names import DOCXNamespace
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.localization import canonicalize_lang
from calibre.utils.logging import default_log
@ -25,7 +25,7 @@ def fromstring(raw, parser=RECOVER_PARSER):
return etree.fromstring(raw, parser=parser)
# Read metadata {{{
def read_doc_props(raw, mi):
def read_doc_props(raw, mi, XPath):
root = fromstring(raw)
titles = XPath('//dc:title')(root)
if titles:
@ -72,7 +72,7 @@ def read_app_props(raw, mi):
if company and company[0].text and company[0].text.strip():
mi.publisher = company[0].text.strip()
def read_default_style_language(raw, mi):
def read_default_style_language(raw, mi, XPath):
root = fromstring(raw)
for lang in XPath('/w:styles/w:docDefaults/w:rPrDefault/w:rPr/w:lang/@w:val')(root):
lang = canonicalize_lang(lang)
@ -84,6 +84,7 @@ def read_default_style_language(raw, mi):
class DOCX(object):
def __init__(self, path_or_stream, log=None, extract=True):
self.docx_is_transitional = True
stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb')
self.name = getattr(stream, 'name', None) or '<stream>'
self.log = log or default_log
@ -93,6 +94,7 @@ class DOCX(object):
self.init_zipfile(stream)
self.read_content_types()
self.read_package_relationships()
self.namespace = DOCXNamespace(self.docx_is_transitional)
def init_zipfile(self, stream):
self.zipf = ZipFile(stream)
@ -158,12 +160,14 @@ class DOCX(object):
for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
target = item.get('Target').lstrip('/')
typ = item.get('Type')
if target == 'word/document.xml':
self.docx_is_transitional = typ != 'http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument'
self.relationships[typ] = target
self.relationships_rmap[target] = typ
@property
def document_name(self):
name = self.relationships.get(DOCUMENT, None)
name = self.relationships.get(self.namespace.names['DOCUMENT'], None)
if name is None:
names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
if not names:
@ -201,13 +205,13 @@ class DOCX(object):
return by_id, by_type
def get_document_properties_names(self):
name = self.relationships.get(DOCPROPS, None)
name = self.relationships.get(self.namespace.names['DOCPROPS'], None)
if name is None:
names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml')
if names:
name = names[0]
yield name
name = self.relationships.get(APPPROPS, None)
name = self.relationships.get(self.namespace.names['APPPROPS'], None)
if name is None:
names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
if names:
@ -224,16 +228,16 @@ class DOCX(object):
except KeyError:
pass
else:
read_doc_props(raw, mi)
read_doc_props(raw, mi, self.namespace.XPath)
if mi.is_null('language'):
try:
raw = self.read('word/styles.xml')
except KeyError:
pass
else:
read_default_style_language(raw, mi)
read_default_style_language(raw, mi, self.namespace.XPath)
ap_name = self.relationships.get(APPPROPS, None)
ap_name = self.relationships.get(self.namespace.names['APPPROPS'], None)
if ap_name:
try:
raw = self.read(ap_name)

View File

@ -9,7 +9,6 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from calibre.ebooks.docx.index import process_index, polish_index_markup
from calibre.ebooks.docx.names import XPath, get, namespaces
class Field(object):
@ -48,9 +47,6 @@ scanner = re.Scanner([
null = object()
def WORD(x):
return '{%s}%s' % (namespaces['w'], x)
def parser(name, field_map, default_field_name=None):
field_map = dict((x.split(':') for x in field_map.split()))
@ -98,22 +94,23 @@ parse_noteref = parser('noteref',
class Fields(object):
def __init__(self):
def __init__(self, namespace):
self.namespace = namespace
self.fields = []
self.index_bookmark_counter = 0
self.index_bookmark_prefix = 'index-'
def __call__(self, doc, log):
all_ids = frozenset(XPath('//*/@w:id')(doc))
all_ids = frozenset(self.namespace.XPath('//*/@w:id')(doc))
c = 0
while self.index_bookmark_prefix in all_ids:
c += 1
self.index_bookmark_prefix = self.index_bookmark_prefix.replace('-', '%d-' % c)
stack = []
for elem in XPath(
for elem in self.namespace.XPath(
'//*[name()="w:p" or name()="w:r" or name()="w:instrText" or (name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end"))]')(doc):
if elem.tag.endswith('}fldChar'):
typ = get(elem, 'w:fldCharType')
typ = self.namespace.get(elem, 'w:fldCharType')
if typ == 'begin':
stack.append(Field(elem))
self.fields.append(stack[-1])
@ -193,6 +190,8 @@ class Fields(object):
if xe:
# We insert a synthetic bookmark around this index item so that we
# can link to it later
def WORD(x):
return self.namespace.expand('w:' + x)
self.index_bookmark_counter += 1
bmark = xe['anchor'] = '%s%d' % (self.index_bookmark_prefix, self.index_bookmark_counter)
p = field.start.getparent()
@ -210,7 +209,7 @@ class Fields(object):
if not field.contents:
return
idx = parse_func(field.instructions, log)
hyperlinks, blocks = process_index(field, idx, self.xe_fields, log)
hyperlinks, blocks = process_index(field, idx, self.xe_fields, log, self.namespace.XPath, self.namespace.expand)
if not blocks:
return
for anchor, run in hyperlinks:

View File

@ -10,7 +10,6 @@ import os, re
from collections import namedtuple
from calibre.ebooks.docx.block_styles import binary_property, inherit
from calibre.ebooks.docx.names import XPath, get
from calibre.utils.filenames import ascii_filename
from calibre.utils.fonts.scanner import font_scanner, NoFonts
from calibre.utils.fonts.utils import panose_to_css_generic_family, is_truetype_font
@ -29,7 +28,7 @@ def get_variant(bold=False, italic=False):
class Family(object):
def __init__(self, elem, embed_relationships):
def __init__(self, elem, embed_relationships, XPath, get):
self.name = self.family_name = get(elem, 'w:name')
self.alt_names = tuple(get(x, 'w:val') for x in XPath('./w:altName')(elem))
if self.alt_names and not has_system_fonts(self.name):
@ -51,7 +50,7 @@ class Family(object):
for x in XPath('./w:family[@w:val]')(elem):
self.generic_family = get(x, 'w:val', 'auto')
ntt = binary_property(elem, 'notTrueType')
ntt = binary_property(elem, 'notTrueType', XPath, get)
self.is_ttf = ntt is inherit or not ntt
self.panose1 = None
@ -73,13 +72,14 @@ class Family(object):
class Fonts(object):
def __init__(self):
def __init__(self, namespace):
self.namespace = namespace
self.fonts = {}
self.used = set()
def __call__(self, root, embed_relationships, docx, dest_dir):
for elem in XPath('//w:font[@w:name]')(root):
self.fonts[get(elem, 'w:name')] = Family(elem, embed_relationships)
for elem in self.namespace.XPath('//w:font[@w:name]')(root):
self.fonts[self.namespace.get(elem, 'w:name')] = Family(elem, embed_relationships, self.namespace.XPath, self.namespace.get)
def family_for(self, name, bold=False, italic=False):
f = self.fonts.get(name, None)

View File

@ -8,42 +8,43 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict
from calibre.ebooks.docx.names import get, XPath, descendants
class Note(object):
def __init__(self, parent, rels):
self.type = get(parent, 'w:type', 'normal')
def __init__(self, namespace, parent, rels):
self.type = namespace.get(parent, 'w:type', 'normal')
self.parent = parent
self.rels = rels
self.namespace = namespace
def __iter__(self):
for p in descendants(self.parent, 'w:p', 'w:tbl'):
for p in self.namespace.descendants(self.parent, 'w:p', 'w:tbl'):
yield p
class Footnotes(object):
def __init__(self):
def __init__(self, namespace):
self.namespace = namespace
self.footnotes = {}
self.endnotes = {}
self.counter = 0
self.notes = OrderedDict()
def __call__(self, footnotes, footnotes_rels, endnotes, endnotes_rels):
XPath, get = self.namespace.XPath, self.namespace.get
if footnotes is not None:
for footnote in XPath('./w:footnote[@w:id]')(footnotes):
fid = get(footnote, 'w:id')
if fid:
self.footnotes[fid] = Note(footnote, footnotes_rels)
self.footnotes[fid] = Note(self.namespace, footnote, footnotes_rels)
if endnotes is not None:
for endnote in XPath('./w:endnote[@w:id]')(endnotes):
fid = get(endnote, 'w:id')
if fid:
self.endnotes[fid] = Note(endnote, endnotes_rels)
self.endnotes[fid] = Note(self.namespace, endnote, endnotes_rels)
def get_ref(self, ref):
fid = get(ref, 'w:id')
fid = self.namespace.get(ref, 'w:id')
notes = self.footnotes if ref.tag.endswith('}footnoteReference') else self.endnotes
note = notes.get(fid, None)
if note is not None and note.type == 'normal':

View File

@ -11,7 +11,7 @@ import os
from lxml.html.builder import IMG, HR
from calibre.constants import iswindows
from calibre.ebooks.docx.names import XPath, get, barename
from calibre.ebooks.docx.names import barename
from calibre.utils.filenames import ascii_filename
from calibre.utils.imghdr import what
@ -27,7 +27,7 @@ def emu_to_pt(x):
def pt_to_emu(x):
return int(x * 12700)
def get_image_properties(parent):
def get_image_properties(parent, XPath, get):
width = height = None
for extent in XPath('./wp:extent')(parent):
try:
@ -67,7 +67,7 @@ def get_image_margins(elem):
ans['padding-%s' % css] = '%.3gpt' % val
return ans
def get_hpos(anchor, page_width):
def get_hpos(anchor, page_width, XPath, get):
for ph in XPath('./wp:positionH')(anchor):
rp = ph.get('relativeFrom', None)
if rp == 'leftMargin':
@ -101,7 +101,8 @@ def get_hpos(anchor, page_width):
class Images(object):
def __init__(self, log):
def __init__(self, namespace, log):
self.namespace = namespace
self.rid_map = {}
self.used = {}
self.names = set()
@ -158,6 +159,7 @@ class Images(object):
return name
def pic_to_img(self, pic, alt, parent):
XPath, get = self.namespace.XPath, self.namespace.get
name = None
link = None
for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
@ -191,9 +193,10 @@ class Images(object):
return img
def drawing_to_html(self, drawing, page):
XPath, get = self.namespace.XPath, self.namespace.get
# First process the inline pictures
for inline in XPath('./wp:inline')(drawing):
style, alt = get_image_properties(inline)
style, alt = get_image_properties(inline, XPath, get)
for pic in XPath('descendant::pic:pic')(inline):
ans = self.pic_to_img(pic, alt, inline)
if ans is not None:
@ -203,7 +206,7 @@ class Images(object):
# Now process the floats
for anchor in XPath('./wp:anchor')(drawing):
style, alt = get_image_properties(anchor)
style, alt = get_image_properties(anchor, XPath, get)
self.get_float_properties(anchor, style, page)
for pic in XPath('descendant::pic:pic')(anchor):
ans = self.pic_to_img(pic, alt, anchor)
@ -213,6 +216,7 @@ class Images(object):
yield ans
def pict_to_html(self, pict, page):
XPath, get = self.namespace.XPath, self.namespace.get
# First see if we have an <hr>
is_hr = len(pict) == 1 and get(pict[0], 'o:hr') in {'t', 'true'}
if is_hr:
@ -247,6 +251,7 @@ class Images(object):
yield img
def get_float_properties(self, anchor, style, page):
XPath, get = self.namespace.XPath, self.namespace.get
if 'display' not in style:
style['display'] = 'block'
padding = get_image_margins(anchor)
@ -257,7 +262,7 @@ class Images(object):
# Ignore margins
page_width = page.width
hpos = get_hpos(anchor, page_width) + width/(2*page_width)
hpos = get_hpos(anchor, page_width, XPath, get) + width/(2*page_width)
wrap_elem = None
dofloat = False

View File

@ -10,10 +10,9 @@ from operator import itemgetter
from lxml import etree
from calibre.ebooks.docx.names import XPath, expand
from calibre.utils.icu import partition_by_first_letter, sort_key
def get_applicable_xe_fields(index, xe_fields):
def get_applicable_xe_fields(index, xe_fields, XPath, expand):
iet = index.get('entry-type', None)
xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet]
@ -40,7 +39,7 @@ def get_applicable_xe_fields(index, xe_fields):
return [xe for xe in xe_fields if contained(xe)]
def make_block(style, parent, pos):
def make_block(expand, style, parent, pos):
p = parent.makeelement(expand('w:p'))
parent.insert(pos, p)
if style is not None:
@ -56,7 +55,7 @@ def make_block(style, parent, pos):
r.append(t)
return p, t
def add_xe(xe, t):
def add_xe(xe, t, expand):
text = xe.get('text', '')
pt = xe.get('page-number-text', None)
t.text = text or ' '
@ -70,7 +69,7 @@ def add_xe(xe, t):
r.append(t2)
return xe['anchor'], t.getparent()
def process_index(field, index, xe_fields, log):
def process_index(field, index, xe_fields, log, XPath, expand):
'''
We remove all the word generated index markup and replace it with our own
that is more suitable for an ebook.
@ -89,7 +88,7 @@ def process_index(field, index, xe_fields, log):
start_pos = (p, p.index(elem))
p.remove(elem)
xe_fields = get_applicable_xe_fields(index, xe_fields)
xe_fields = get_applicable_xe_fields(index, xe_fields, XPath, expand)
if not xe_fields:
return [], []
if heading_text is not None:
@ -107,14 +106,14 @@ def process_index(field, index, xe_fields, log):
for item in reversed(items):
is_heading = not isinstance(item, dict)
style = heading_style if is_heading else None
p, t = make_block(style, *start_pos)
p, t = make_block(expand, style, *start_pos)
if is_heading:
text = heading_text
if text.lower().startswith('a'):
text = item + text[1:]
t.text = text
else:
hyperlinks.append(add_xe(item, t))
hyperlinks.append(add_xe(item, t, expand))
blocks.append(p)
return hyperlinks, blocks

View File

@ -12,22 +12,25 @@ from lxml.etree import XPath as X
from calibre.utils.filenames import ascii_text
DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles'
NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering'
FONTS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable'
EMBEDDED_FONT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/font'
IMAGES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
LINKS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink'
FOOTNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes'
ENDNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes'
THEMES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme'
SETTINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings'
WEB_SETTINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings'
# Names {{{
TRANSITIONAL_NAMES = {
'DOCUMENT' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument',
'DOCPROPS' : 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties',
'APPPROPS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties',
'STYLES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles',
'NUMBERING' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering',
'FONTS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable',
'EMBEDDED_FONT' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/font',
'IMAGES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image',
'LINKS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink',
'FOOTNOTES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes',
'ENDNOTES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes',
'THEMES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme',
'SETTINGS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings',
'WEB_SETTINGS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings',
}
namespaces = {
TRANSITIONAL_NAMESPACES = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
'o': 'urn:schemas-microsoft-com:office:office',
've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
@ -57,40 +60,13 @@ namespaces = {
'dcmitype': 'http://purl.org/dc/dcmitype/',
'dcterms': 'http://purl.org/dc/terms/'
}
xpath_cache = {}
def XPath(expr):
ans = xpath_cache.get(expr, None)
if ans is None:
xpath_cache[expr] = ans = X(expr, namespaces=namespaces)
return ans
def is_tag(x, q):
tag = getattr(x, 'tag', x)
ns, name = q.partition(':')[0::2]
return '{%s}%s' % (namespaces.get(ns, None), name) == tag
# }}}
def barename(x):
return x.rpartition('}')[-1]
def XML(x):
return '{%s}%s' % (namespaces['xml'], x)
def expand(name, sep=':'):
ns, tag = name.partition(sep)[::2]
if ns and tag:
tag = '{%s}%s' % (namespaces[ns], tag)
return tag or ns
def get(x, attr, default=None):
return x.attrib.get(expand(attr), default)
def ancestor(elem, name):
try:
return XPath('ancestor::%s[1]' % name)(elem)[0]
except IndexError:
return None
return '{%s}%s' % (TRANSITIONAL_NAMESPACES['xml'], x)
def generate_anchor(name, existing):
x = y = 'id_' + re.sub(r'[^0-9a-zA-Z_]', '', ascii_text(name)).lstrip('_')
@ -100,14 +76,48 @@ def generate_anchor(name, existing):
c += 1
return y
def children(elem, *args):
return XPath('|'.join('child::%s' % a for a in args))(elem)
class DOCXNamespace(object):
def descendants(elem, *args):
return XPath('|'.join('descendant::%s' % a for a in args))(elem)
def __init__(self, transitional=True):
self.xpath_cache = {}
if transitional:
self.namespaces = TRANSITIONAL_NAMESPACES.copy()
self.names = TRANSITIONAL_NAMES.copy()
def makeelement(root, tag, append=True, **attrs):
ans = root.makeelement(expand(tag), **{expand(k, sep='_'):v for k, v in attrs.iteritems()})
def XPath(self, expr):
ans = self.xpath_cache.get(expr, None)
if ans is None:
self.xpath_cache[expr] = ans = X(expr, namespaces=self.namespaces)
return ans
def is_tag(self, x, q):
tag = getattr(x, 'tag', x)
ns, name = q.partition(':')[0::2]
return '{%s}%s' % (self.namespaces.get(ns, None), name) == tag
def expand(self, name, sep=':'):
ns, tag = name.partition(sep)[::2]
if ns and tag:
tag = '{%s}%s' % (self.namespaces[ns], tag)
return tag or ns
def get(self, x, attr, default=None):
return x.attrib.get(self.expand(attr), default)
def ancestor(self, elem, name):
try:
return self.XPath('ancestor::%s[1]' % name)(elem)[0]
except IndexError:
return None
def children(self, elem, *args):
return self.XPath('|'.join('child::%s' % a for a in args))(elem)
def descendants(self, elem, *args):
return self.XPath('|'.join('descendant::%s' % a for a in args))(elem)
def makeelement(self, root, tag, append=True, **attrs):
ans = root.makeelement(self.expand(tag), **{self.expand(k, sep='_'):v for k, v in attrs.iteritems()})
if append:
root.append(ans)
return ans

View File

@ -13,7 +13,6 @@ from lxml.html.builder import OL, UL, SPAN
from calibre.ebooks.docx.block_styles import ParagraphStyle
from calibre.ebooks.docx.char_styles import RunStyle, inherit
from calibre.ebooks.docx.names import XPath, get
STYLE_MAP = {
'aiueo': 'hiragana',
@ -32,7 +31,8 @@ STYLE_MAP = {
class Level(object):
def __init__(self, lvl=None):
def __init__(self, namespace, lvl=None):
self.namespace = namespace
self.restart = None
self.start = 0
self.fmt = 'decimal'
@ -47,7 +47,7 @@ class Level(object):
self.read_from_xml(lvl)
def copy(self):
ans = Level()
ans = Level(self.namespace)
for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template', 'bullet_template'):
setattr(ans, x, getattr(self, x))
return ans
@ -61,6 +61,7 @@ class Level(object):
return re.sub(r'%(\d+)', sub, template).rstrip() + '\xa0'
def read_from_xml(self, lvl, override=False):
XPath, get = self.namespace.XPath, self.namespace.get
for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
try:
self.restart = int(get(lr, 'w:val'))
@ -74,7 +75,7 @@ class Level(object):
pass
for rPr in XPath('./w:rPr')(lvl):
ps = RunStyle(rPr)
ps = RunStyle(self.namespace, rPr)
if self.character_style is None:
self.character_style = ps
else:
@ -106,7 +107,7 @@ class Level(object):
self.para_link = get(lr, 'w:val')
for pPr in XPath('./w:pPr')(lvl):
ps = ParagraphStyle(pPr)
ps = ParagraphStyle(self.namespace, pPr)
if self.paragraph_style is None:
self.paragraph_style = ps
else:
@ -135,7 +136,9 @@ class Level(object):
class NumberingDefinition(object):
def __init__(self, parent=None, an_id=None):
def __init__(self, namespace, parent=None, an_id=None):
self.namespace = namespace
XPath, get = self.namespace.XPath, self.namespace.get
self.levels = {}
self.abstract_numbering_definition_id = an_id
if parent is not None:
@ -144,17 +147,18 @@ class NumberingDefinition(object):
ilvl = int(get(lvl, 'w:ilvl', 0))
except (TypeError, ValueError):
ilvl = 0
self.levels[ilvl] = Level(lvl)
self.levels[ilvl] = Level(namespace, lvl)
def copy(self):
ans = NumberingDefinition(an_id=self.abstract_numbering_definition_id)
ans = NumberingDefinition(self.namespace, an_id=self.abstract_numbering_definition_id)
for l, lvl in self.levels.iteritems():
ans.levels[l] = lvl.copy()
return ans
class Numbering(object):
def __init__(self):
def __init__(self, namespace):
self.namespace = namespace
self.definitions = {}
self.instances = {}
self.counters = defaultdict(Counter)
@ -163,6 +167,7 @@ class Numbering(object):
def __call__(self, root, styles, rid_map):
' Read all numbering style definitions '
XPath, get = self.namespace.XPath, self.namespace.get
self.rid_map = rid_map
for npb in XPath('./w:numPicBullet[@w:numPicBulletId]')(root):
npbid = get(npb, 'w:numPicBulletId')
@ -176,7 +181,7 @@ class Numbering(object):
if nsl:
lazy_load[an_id] = get(nsl[0], 'w:val')
else:
nd = NumberingDefinition(an, an_id=an_id)
nd = NumberingDefinition(self.namespace, an, an_id=an_id)
self.definitions[an_id] = nd
def create_instance(n, definition):
@ -199,7 +204,7 @@ class Numbering(object):
ilvl = nilvl if ilvl is None else ilvl
alvl = nd.levels.get(ilvl, None)
if alvl is None:
alvl = Level()
alvl = Level(self.namespace)
alvl.read_from_xml(lvl, override=True)
for ilvl, so in start_overrides.iteritems():
try:

View File

@ -6,17 +6,16 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.ebooks.docx.names import XPath, get
class Settings(object):
def __init__(self):
def __init__(self, namespace):
self.default_tab_stop = 720 / 20
self.namespace = namespace
def __call__(self, root):
for dts in XPath('//w:defaultTabStop[@w:val]')(root):
for dts in self.namespace.XPath('//w:defaultTabStop[@w:val]')(root):
try:
self.default_tab_stop = int(get(dts, 'w:val')) / 20
self.default_tab_stop = int(self.namespace.get(dts, 'w:val')) / 20
except (ValueError, TypeError, AttributeError):
pass

View File

@ -12,7 +12,6 @@ from collections import OrderedDict, Counter
from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit
from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.tables import TableStyle
from calibre.ebooks.docx.names import XPath, get
class PageProperties(object):
@ -21,12 +20,12 @@ class PageProperties(object):
sectPr elements.
'''
def __init__(self, elems=()):
def __init__(self, namespace, elems=()):
self.width = self.height = 595.28, 841.89 # pts, A4
self.margin_left = self.margin_right = 72 # pts
for sectPr in elems:
for pgSz in XPath('./w:pgSz')(sectPr):
w, h = get(pgSz, 'w:w'), get(pgSz, 'w:h')
for pgSz in namespace.XPath('./w:pgSz')(sectPr):
w, h = namespace.get(pgSz, 'w:w'), namespace.get(pgSz, 'w:h')
try:
self.width = int(w)/20
except (ValueError, TypeError):
@ -35,8 +34,8 @@ class PageProperties(object):
self.height = int(h)/20
except (ValueError, TypeError):
pass
for pgMar in XPath('./w:pgMar')(sectPr):
l, r = get(pgMar, 'w:left'), get(pgMar, 'w:right')
for pgMar in namespace.XPath('./w:pgMar')(sectPr):
l, r = namespace.get(pgMar, 'w:left'), namespace.get(pgMar, 'w:right')
try:
self.margin_left = int(l)/20
except (ValueError, TypeError):
@ -52,41 +51,41 @@ class Style(object):
Class representing a <w:style> element. Can contain block, character, etc. styles.
'''
name_path = XPath('./w:name[@w:val]')
based_on_path = XPath('./w:basedOn[@w:val]')
def __init__(self, elem):
def __init__(self, namespace, elem):
self.namespace = namespace
self.name_path = namespace.XPath('./w:name[@w:val]')
self.based_on_path = namespace.XPath('./w:basedOn[@w:val]')
self.resolved = False
self.style_id = get(elem, 'w:styleId')
self.style_type = get(elem, 'w:type')
self.style_id = namespace.get(elem, 'w:styleId')
self.style_type = namespace.get(elem, 'w:type')
names = self.name_path(elem)
self.name = get(names[-1], 'w:val') if names else None
self.name = namespace.get(names[-1], 'w:val') if names else None
based_on = self.based_on_path(elem)
self.based_on = get(based_on[0], 'w:val') if based_on else None
self.based_on = namespace.get(based_on[0], 'w:val') if based_on else None
if self.style_type == 'numbering':
self.based_on = None
self.is_default = get(elem, 'w:default') in {'1', 'on', 'true'}
self.is_default = namespace.get(elem, 'w:default') in {'1', 'on', 'true'}
self.paragraph_style = self.character_style = self.table_style = None
if self.style_type in {'paragraph', 'character', 'table'}:
if self.style_type == 'table':
for tblPr in XPath('./w:tblPr')(elem):
ts = TableStyle(tblPr)
for tblPr in namespace.XPath('./w:tblPr')(elem):
ts = TableStyle(namespace, tblPr)
if self.table_style is None:
self.table_style = ts
else:
self.table_style.update(ts)
if self.style_type in {'paragraph', 'table'}:
for pPr in XPath('./w:pPr')(elem):
ps = ParagraphStyle(pPr)
for pPr in namespace.XPath('./w:pPr')(elem):
ps = ParagraphStyle(namespace, pPr)
if self.paragraph_style is None:
self.paragraph_style = ps
else:
self.paragraph_style.update(ps)
for rPr in XPath('./w:rPr')(elem):
rs = RunStyle(rPr)
for rPr in namespace.XPath('./w:rPr')(elem):
rs = RunStyle(namespace, rPr)
if self.character_style is None:
self.character_style = rs
else:
@ -94,21 +93,21 @@ class Style(object):
if self.style_type in {'numbering', 'paragraph'}:
self.numbering_style_link = None
for x in XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
self.numbering_style_link = get(x, 'w:val')
for x in namespace.XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
self.numbering_style_link = namespace.get(x, 'w:val')
def resolve_based_on(self, parent):
if parent.table_style is not None:
if self.table_style is None:
self.table_style = TableStyle()
self.table_style = TableStyle(self.namespace)
self.table_style.resolve_based_on(parent.table_style)
if parent.paragraph_style is not None:
if self.paragraph_style is None:
self.paragraph_style = ParagraphStyle()
self.paragraph_style = ParagraphStyle(self.namespace)
self.paragraph_style.resolve_based_on(parent.paragraph_style)
if parent.character_style is not None:
if self.character_style is None:
self.character_style = RunStyle()
self.character_style = RunStyle(self.namespace)
self.character_style.resolve_based_on(parent.character_style)
@ -118,7 +117,8 @@ class Styles(object):
Collection of all styles defined in the document. Used to get the final styles applicable to elements in the document markup.
'''
def __init__(self, tables):
def __init__(self, namespace, tables):
self.namespace = namespace
self.id_map = OrderedDict()
self.para_cache = {}
self.para_char_cache = {}
@ -144,8 +144,8 @@ class Styles(object):
def __call__(self, root, fonts, theme):
self.fonts, self.theme = fonts, theme
for s in XPath('//w:style')(root):
s = Style(s)
for s in self.namespace.XPath('//w:style')(root):
s = Style(self.namespace, s)
if s.style_id:
self.id_map[s.style_id] = s
if s.is_default:
@ -155,17 +155,17 @@ class Styles(object):
self.default_paragraph_style = self.default_character_style = None
for dd in XPath('./w:docDefaults')(root):
for pd in XPath('./w:pPrDefault')(dd):
for pPr in XPath('./w:pPr')(pd):
ps = ParagraphStyle(pPr)
for dd in self.namespace.XPath('./w:docDefaults')(root):
for pd in self.namespace.XPath('./w:pPrDefault')(dd):
for pPr in self.namespace.XPath('./w:pPr')(pd):
ps = ParagraphStyle(self.namespace, pPr)
if self.default_paragraph_style is None:
self.default_paragraph_style = ps
else:
self.default_paragraph_style.update(ps)
for pd in XPath('./w:rPrDefault')(dd):
for pPr in XPath('./w:rPr')(pd):
ps = RunStyle(pPr)
for pd in self.namespace.XPath('./w:rPrDefault')(dd):
for pPr in self.namespace.XPath('./w:rPr')(pd):
ps = RunStyle(self.namespace, pPr)
if self.default_character_style is None:
self.default_character_style = ps
else:
@ -213,18 +213,18 @@ class Styles(object):
ans = self.para_cache.get(p, None)
if ans is None:
linked_style = None
ans = self.para_cache[p] = ParagraphStyle()
ans = self.para_cache[p] = ParagraphStyle(self.namespace)
ans.style_name = None
direct_formatting = None
for pPr in XPath('./w:pPr')(p):
ps = ParagraphStyle(pPr)
for pPr in self.namespace.XPath('./w:pPr')(p):
ps = ParagraphStyle(self.namespace, pPr)
if direct_formatting is None:
direct_formatting = ps
else:
direct_formatting.update(ps)
if direct_formatting is None:
direct_formatting = ParagraphStyle()
direct_formatting = ParagraphStyle(self.namespace)
parent_styles = []
if self.default_paragraph_style is not None:
parent_styles.append(self.default_paragraph_style)
@ -275,19 +275,19 @@ class Styles(object):
def resolve_run(self, r):
ans = self.run_cache.get(r, None)
if ans is None:
p = XPath('ancestor::w:p[1]')(r)
p = self.namespace.XPath('ancestor::w:p[1]')(r)
p = p[0] if p else None
ans = self.run_cache[r] = RunStyle()
ans = self.run_cache[r] = RunStyle(self.namespace)
direct_formatting = None
for rPr in XPath('./w:rPr')(r):
rs = RunStyle(rPr)
for rPr in self.namespace.XPath('./w:rPr')(r):
rs = RunStyle(self.namespace, rPr)
if direct_formatting is None:
direct_formatting = rs
else:
direct_formatting.update(rs)
if direct_formatting is None:
direct_formatting = RunStyle()
direct_formatting = RunStyle(self.namespace)
parent_styles = []
default_char = self.default_styles.get('character', None)
@ -484,5 +484,3 @@ class Styles(object):
b = '\n'.join(b)
ans.append('.%s {\n%s\n}\n' % (cls, b.rstrip(';')))
return prefix + '\n' + '\n'.join(ans)

View File

@ -10,13 +10,12 @@ from lxml.html.builder import TABLE, TR, TD
from calibre.ebooks.docx.block_styles import inherit, read_shd as rs, read_border, binary_property, border_props, ParagraphStyle
from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.names import XPath, get, is_tag
# Read from XML {{{
read_shd = rs
edges = ('left', 'top', 'right', 'bottom')
def _read_width(elem):
def _read_width(elem, get):
ans = inherit
try:
w = int(get(elem, 'w:w'))
@ -33,29 +32,29 @@ def _read_width(elem):
ans = '%.3g%%' % (w/50)
return ans
def read_width(parent, dest):
def read_width(parent, dest, XPath, get):
ans = inherit
for tblW in XPath('./w:tblW')(parent):
ans = _read_width(tblW)
ans = _read_width(tblW, get)
setattr(dest, 'width', ans)
def read_cell_width(parent, dest):
def read_cell_width(parent, dest, XPath, get):
ans = inherit
for tblW in XPath('./w:tcW')(parent):
ans = _read_width(tblW)
ans = _read_width(tblW, get)
setattr(dest, 'width', ans)
def read_padding(parent, dest):
def read_padding(parent, dest, XPath, get):
name = 'tblCellMar' if parent.tag.endswith('}tblPr') else 'tcMar'
ans = {x:inherit for x in edges}
for mar in XPath('./w:%s' % name)(parent):
for x in edges:
for edge in XPath('./w:%s' % x)(mar):
ans[x] = _read_width(edge)
ans[x] = _read_width(edge, get)
for x in edges:
setattr(dest, 'cell_padding_%s' % x, ans[x])
def read_justification(parent, dest):
def read_justification(parent, dest, XPath, get):
left = right = inherit
for jc in XPath('./w:jc[@w:val]')(parent):
val = get(jc, 'w:val')
@ -70,31 +69,31 @@ def read_justification(parent, dest):
setattr(dest, 'margin_left', left)
setattr(dest, 'margin_right', right)
def read_spacing(parent, dest):
def read_spacing(parent, dest, XPath, get):
ans = inherit
for cs in XPath('./w:tblCellSpacing')(parent):
ans = _read_width(cs)
ans = _read_width(cs, get)
setattr(dest, 'spacing', ans)
def read_float(parent, dest):
def read_float(parent, dest, XPath, get):
ans = inherit
for x in XPath('./w:tblpPr')(parent):
ans = {k.rpartition('}')[-1]: v for k, v in x.attrib.iteritems()}
setattr(dest, 'float', ans)
def read_indent(parent, dest):
def read_indent(parent, dest, XPath, get):
ans = inherit
for cs in XPath('./w:tblInd')(parent):
ans = _read_width(cs)
ans = _read_width(cs, get)
setattr(dest, 'indent', ans)
border_edges = ('left', 'top', 'right', 'bottom', 'insideH', 'insideV')
def read_borders(parent, dest):
def read_borders(parent, dest, XPath, get):
name = 'tblBorders' if parent.tag.endswith('}tblPr') else 'tcBorders'
read_border(parent, dest, border_edges, name)
read_border(parent, dest, XPath, get, border_edges, name)
def read_height(parent, dest):
def read_height(parent, dest, XPath, get):
ans = inherit
for rh in XPath('./w:trHeight')(parent):
rule = get(rh, 'w:hRule', 'auto')
@ -103,14 +102,14 @@ def read_height(parent, dest):
ans = (rule, val)
setattr(dest, 'height', ans)
def read_vertical_align(parent, dest):
def read_vertical_align(parent, dest, XPath, get):
ans = inherit
for va in XPath('./w:vAlign')(parent):
val = get(va, 'w:val')
ans = {'center': 'middle', 'top': 'top', 'bottom': 'bottom'}.get(val, 'middle')
setattr(dest, 'vertical_align', ans)
def read_col_span(parent, dest):
def read_col_span(parent, dest, XPath, get):
ans = inherit
for gs in XPath('./w:gridSpan')(parent):
try:
@ -119,14 +118,14 @@ def read_col_span(parent, dest):
continue
setattr(dest, 'col_span', ans)
def read_merge(parent, dest):
def read_merge(parent, dest, XPath, get):
for x in ('hMerge', 'vMerge'):
ans = inherit
for m in XPath('./w:%s' % x)(parent):
ans = get(m, 'w:val', 'continue')
setattr(dest, x, ans)
def read_band_size(parent, dest):
def read_band_size(parent, dest, XPath, get):
for x in ('Col', 'Row'):
ans = 1
for y in XPath('./w:tblStyle%sBandSize' % x)(parent):
@ -136,7 +135,7 @@ def read_band_size(parent, dest):
continue
setattr(dest, '%s_band_size' % x.lower(), ans)
def read_look(parent, dest):
def read_look(parent, dest, XPath, get):
ans = 0
for x in XPath('./w:tblLook')(parent):
try:
@ -148,8 +147,10 @@ def read_look(parent, dest):
# }}}
def clone(style):
if style is None:
return None
try:
ans = type(style)()
ans = type(style)(style.namespace)
except TypeError:
return None
ans.update(style)
@ -190,16 +191,17 @@ class RowStyle(Style):
all_properties = ('height', 'cantSplit', 'hidden', 'spacing',)
def __init__(self, trPr=None):
def __init__(self, namespace, trPr=None):
self.namespace = namespace
if trPr is None:
for p in self.all_properties:
setattr(self, p, inherit)
else:
for p in ('hidden', 'cantSplit'):
setattr(self, p, binary_property(trPr, p))
setattr(self, p, binary_property(trPr, p, namespace.XPath, namespace.get))
for p in ('spacing', 'height'):
f = globals()['read_%s' % p]
f(trPr, self)
f(trPr, self, namespace.XPath, namespace.get)
self._css = None
@property
@ -226,14 +228,15 @@ class CellStyle(Style):
'cell_padding_bottom', 'width', 'vertical_align', 'col_span', 'vMerge', 'hMerge', 'row_span',
) + tuple(k % edge for edge in border_edges for k in border_props)
def __init__(self, tcPr=None):
def __init__(self, namespace, tcPr=None):
self.namespace = namespace
if tcPr is None:
for p in self.all_properties:
setattr(self, p, inherit)
else:
for x in ('borders', 'shd', 'padding', 'cell_width', 'vertical_align', 'col_span', 'merge'):
f = globals()['read_%s' % x]
f(tcPr, self)
f(tcPr, self, namespace.XPath, namespace.get)
self.row_span = inherit
self._css = None
@ -270,7 +273,8 @@ class TableStyle(Style):
'spacing', 'indent', 'overrides', 'col_band_size', 'row_band_size', 'look',
) + tuple(k % edge for edge in border_edges for k in border_props)
def __init__(self, tblPr=None):
def __init__(self, namespace, tblPr=None):
self.namespace = namespace
if tblPr is None:
for p in self.all_properties:
setattr(self, p, inherit)
@ -278,23 +282,23 @@ class TableStyle(Style):
self.overrides = inherit
for x in ('width', 'float', 'padding', 'shd', 'justification', 'spacing', 'indent', 'borders', 'band_size', 'look'):
f = globals()['read_%s' % x]
f(tblPr, self)
f(tblPr, self, self.namespace.XPath, self.namespace.get)
parent = tblPr.getparent()
if is_tag(parent, 'w:style'):
if self.namespace.is_tag(parent, 'w:style'):
self.overrides = {}
for tblStylePr in XPath('./w:tblStylePr[@w:type]')(parent):
otype = get(tblStylePr, 'w:type')
for tblStylePr in self.namespace.XPath('./w:tblStylePr[@w:type]')(parent):
otype = self.namespace.get(tblStylePr, 'w:type')
orides = self.overrides[otype] = {}
for tblPr in XPath('./w:tblPr')(tblStylePr):
orides['table'] = TableStyle(tblPr)
for trPr in XPath('./w:trPr')(tblStylePr):
orides['row'] = RowStyle(trPr)
for tcPr in XPath('./w:tcPr')(tblStylePr):
orides['cell'] = CellStyle(tcPr)
for pPr in XPath('./w:pPr')(tblStylePr):
orides['para'] = ParagraphStyle(pPr)
for rPr in XPath('./w:rPr')(tblStylePr):
orides['run'] = RunStyle(rPr)
for tblPr in self.namespace.XPath('./w:tblPr')(tblStylePr):
orides['table'] = TableStyle(self.namespace, tblPr)
for trPr in self.namespace.XPath('./w:trPr')(tblStylePr):
orides['row'] = RowStyle(self.namespace, trPr)
for tcPr in self.namespace.XPath('./w:tcPr')(tblStylePr):
orides['cell'] = CellStyle(self.namespace, tcPr)
for pPr in self.namespace.XPath('./w:pPr')(tblStylePr):
orides['para'] = ParagraphStyle(self.namespace, pPr)
for rPr in self.namespace.XPath('./w:rPr')(tblStylePr):
orides['run'] = RunStyle(self.namespace, rPr)
self._css = None
def resolve_based_on(self, parent):
@ -343,16 +347,17 @@ class TableStyle(Style):
class Table(object):
def __init__(self, tbl, styles, para_map, is_sub_table=False):
def __init__(self, namespace, tbl, styles, para_map, is_sub_table=False):
self.namespace = namespace
self.tbl = tbl
self.styles = styles
self.is_sub_table = is_sub_table
# Read Table Style
style = {'table':TableStyle()}
for tblPr in XPath('./w:tblPr')(tbl):
for ts in XPath('./w:tblStyle[@w:val]')(tblPr):
style_id = get(ts, 'w:val')
style = {'table':TableStyle(self.namespace)}
for tblPr in self.namespace.XPath('./w:tblPr')(tbl):
for ts in self.namespace.XPath('./w:tblStyle[@w:val]')(tblPr):
style_id = self.namespace.get(ts, 'w:val')
s = styles.get(style_id)
if s is not None:
if s.table_style is not None:
@ -367,7 +372,7 @@ class Table(object):
style['run'].update(s.character_style)
else:
style['run'] = s.character_style
style['table'].update(TableStyle(tblPr))
style['table'].update(TableStyle(self.namespace, tblPr))
self.table_style, self.paragraph_style = style['table'], style.get('paragraph', None)
self.run_style = style.get('run', None)
self.overrides = self.table_style.overrides
@ -380,23 +385,23 @@ class Table(object):
self.paragraphs = []
self.cell_map = []
rows = XPath('./w:tr')(tbl)
rows = self.namespace.XPath('./w:tr')(tbl)
for r, tr in enumerate(rows):
overrides = self.get_overrides(r, None, len(rows), None)
self.resolve_row_style(tr, overrides)
cells = XPath('./w:tc')(tr)
cells = self.namespace.XPath('./w:tc')(tr)
self.cell_map.append([])
for c, tc in enumerate(cells):
overrides = self.get_overrides(r, c, len(rows), len(cells))
self.resolve_cell_style(tc, overrides, r, c, len(rows), len(cells))
self.cell_map[-1].append(tc)
for p in XPath('./w:p')(tc):
for p in self.namespace.XPath('./w:p')(tc):
para_map[p] = self
self.paragraphs.append(p)
self.resolve_para_style(p, overrides)
self.handle_merged_cells()
self.sub_tables = {x:Table(x, styles, para_map, is_sub_table=True) for x in XPath('./w:tr/w:tc/w:tbl')(tbl)}
self.sub_tables = {x:Table(namespace, x, styles, para_map, is_sub_table=True) for x in self.namespace.XPath('./w:tr/w:tc/w:tbl')(tbl)}
def override_allowed(self, name):
'Check if the named override is allowed by the tblLook element'
@ -449,7 +454,7 @@ class Table(object):
return tuple(filter(self.override_allowed, overrides))
def resolve_row_style(self, tr, overrides):
rs = RowStyle()
rs = RowStyle(self.namespace)
for o in overrides:
if o in self.overrides:
ovr = self.overrides[o]
@ -457,12 +462,12 @@ class Table(object):
if ors is not None:
rs.update(ors)
for trPr in XPath('./w:trPr')(tr):
rs.update(RowStyle(trPr))
for trPr in self.namespace.XPath('./w:trPr')(tr):
rs.update(RowStyle(self.namespace, trPr))
self.style_map[tr] = rs
def resolve_cell_style(self, tc, overrides, row, col, rows, cols_in_row):
cs = CellStyle()
cs = CellStyle(self.namespace)
# from lxml.etree import tostring
# txt = tostring(tc, method='text', encoding=unicode)
for o in overrides:
@ -472,8 +477,8 @@ class Table(object):
if ors is not None:
cs.update(ors)
for tcPr in XPath('./w:tcPr')(tc):
cs.update(CellStyle(tcPr))
for tcPr in self.namespace.XPath('./w:tcPr')(tc):
cs.update(CellStyle(self.namespace, tcPr))
for x in edges:
p = 'cell_padding_%s' % x
@ -535,7 +540,7 @@ class Table(object):
try:
s = self.style_map[cell]
except KeyError: # cell is None
s = CellStyle()
s = CellStyle(self.namespace)
if s.vMerge == 'restart':
runs.append([cell])
elif s.vMerge == 'continue':
@ -555,7 +560,7 @@ class Table(object):
try:
s = self.style_map[cell]
except KeyError: # cell is None
s = CellStyle()
s = CellStyle(self.namespace)
if s.col_span is not inherit:
runs.append([])
continue
@ -593,12 +598,12 @@ class Table(object):
parent.insert(idx, table)
else:
parent.append(table)
for row in XPath('./w:tr')(self.tbl):
for row in self.namespace.XPath('./w:tr')(self.tbl):
tr = TR('\n\t\t\t')
style_map[tr] = self.style_map[row]
tr.tail = '\n\t\t'
table.append(tr)
for tc in XPath('./w:tc')(row):
for tc in self.namespace.XPath('./w:tc')(row):
td = TD()
style_map[td] = s = self.style_map[tc]
if s.col_span is not inherit:
@ -607,7 +612,7 @@ class Table(object):
td.set('rowspan', type('')(s.row_span))
td.tail = '\n\t\t\t'
tr.append(td)
for x in XPath('./w:p|./w:tbl')(tc):
for x in self.namespace.XPath('./w:p|./w:tbl')(tc):
if x.tag.endswith('}p'):
td.append(rmap[x])
else:
@ -627,15 +632,16 @@ class Table(object):
class Tables(object):
def __init__(self):
def __init__(self, namespace):
self.tables = []
self.para_map = {}
self.sub_tables = set()
self.namespace = namespace
def register(self, tbl, styles):
if tbl in self.sub_tables:
return
self.tables.append(Table(tbl, styles, self.para_map))
self.tables.append(Table(self.namespace, tbl, styles, self.para_map))
self.sub_tables |= set(self.tables[-1].sub_tables)
def apply_markup(self, object_map, page_map):

View File

@ -6,22 +6,21 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.ebooks.docx.names import XPath
class Theme(object):
def __init__(self):
def __init__(self, namespace):
self.major_latin_font = 'Cambria'
self.minor_latin_font = 'Calibri'
self.namespace = namespace
def __call__(self, root):
for fs in XPath('//a:fontScheme')(root):
for mj in XPath('./a:majorFont')(fs):
for l in XPath('./a:latin[@typeface]')(mj):
for fs in self.namespace.XPath('//a:fontScheme')(root):
for mj in self.namespace.XPath('./a:majorFont')(fs):
for l in self.namespace.XPath('./a:latin[@typeface]')(mj):
self.major_latin_font = l.get('typeface')
for mj in XPath('./a:minorFont')(fs):
for l in XPath('./a:latin[@typeface]')(mj):
for mj in self.namespace.XPath('./a:minorFont')(fs):
for l in self.namespace.XPath('./a:latin[@typeface]')(mj):
self.minor_latin_font = l.get('typeface')
def resolve_font_family(self, ff):

View File

@ -15,9 +15,7 @@ from lxml.html.builder import (
from calibre import guess_type
from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import (
XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor,
descendants, FOOTNOTES, ENDNOTES, children, THEMES, SETTINGS)
from calibre.ebooks.docx.names import XML, generate_anchor
from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
from calibre.ebooks.docx.numbering import Numbering
from calibre.ebooks.docx.fonts import Fonts
@ -54,6 +52,7 @@ class Convert(object):
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
self.docx = DOCX(path_or_stream, log=log)
self.namespace = self.docx.namespace
self.ms_pat = re.compile(r'\s{2,}')
self.ws_pat = re.compile(r'[\n\r\t]')
self.log = self.docx.log
@ -62,12 +61,12 @@ class Convert(object):
self.dest_dir = dest_dir or os.getcwdu()
self.mi = self.docx.metadata
self.body = BODY()
self.theme = Theme()
self.settings = Settings()
self.tables = Tables()
self.fields = Fields()
self.styles = Styles(self.tables)
self.images = Images(self.log)
self.theme = Theme(self.namespace)
self.settings = Settings(self.namespace)
self.tables = Tables(self.namespace)
self.fields = Fields(self.namespace)
self.styles = Styles(self.namespace, self.tables)
self.images = Images(self.namespace, self.log)
self.object_map = OrderedDict()
self.html = HTML(
HEAD(
@ -211,7 +210,7 @@ class Convert(object):
html_obj.set('class', cls)
if notes_header is not None:
for h in children(self.body, 'h1', 'h2', 'h3'):
for h in self.namespace.children(self.body, 'h1', 'h2', 'h3'):
notes_header.tag = h.tag
cls = h.get('class', None)
if cls and cls != 'notes-header':
@ -221,7 +220,7 @@ class Convert(object):
self.fields.polish_markup(self.object_map)
self.log.debug('Cleaning up redundant markup generated by Word')
self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover)
self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)
return self.write(doc)
@ -230,14 +229,14 @@ class Convert(object):
self.page_map = OrderedDict()
self.section_starts = []
for p in descendants(doc, 'w:p', 'w:tbl'):
for p in self.namespace.descendants(doc, 'w:p', 'w:tbl'):
if p.tag.endswith('}tbl'):
self.tables.register(p, self.styles)
current.append(p)
continue
sect = tuple(descendants(p, 'w:sectPr'))
sect = tuple(self.namespace.descendants(p, 'w:sectPr'))
if sect:
pr = PageProperties(sect)
pr = PageProperties(self.namespace, sect)
paras = current + [p]
for x in paras:
self.page_map[x] = pr
@ -248,8 +247,8 @@ class Convert(object):
if current:
self.section_starts.append(current[0])
last = XPath('./w:body/w:sectPr')(doc)
pr = PageProperties(last)
last = self.namespace.XPath('./w:body/w:sectPr')(doc)
pr = PageProperties(self.namespace, last)
for x in current:
self.page_map[x] = pr
@ -264,16 +263,16 @@ class Convert(object):
name = name
return name
nname = get_name(NUMBERING, 'numbering.xml')
sname = get_name(STYLES, 'styles.xml')
sename = get_name(SETTINGS, 'settings.xml')
fname = get_name(FONTS, 'fontTable.xml')
tname = get_name(THEMES, 'theme1.xml')
foname = get_name(FOOTNOTES, 'footnotes.xml')
enname = get_name(ENDNOTES, 'endnotes.xml')
numbering = self.numbering = Numbering()
footnotes = self.footnotes = Footnotes()
fonts = self.fonts = Fonts()
nname = get_name(self.namespace.names['NUMBERING'], 'numbering.xml')
sname = get_name(self.namespace.names['STYLES'], 'styles.xml')
sename = get_name(self.namespace.names['SETTINGS'], 'settings.xml')
fname = get_name(self.namespace.names['FONTS'], 'fontTable.xml')
tname = get_name(self.namespace.names['THEMES'], 'theme1.xml')
foname = get_name(self.namespace.names['FOOTNOTES'], 'footnotes.xml')
enname = get_name(self.namespace.names['ENDNOTES'], 'endnotes.xml')
numbering = self.numbering = Numbering(self.namespace)
footnotes = self.footnotes = Footnotes(self.namespace)
fonts = self.fonts = Fonts(self.namespace)
foraw = enraw = None
forel, enrel = ({}, {}), ({}, {})
@ -337,7 +336,7 @@ class Convert(object):
self.styles.resolve_numbering(numbering)
def write(self, doc):
toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log)
toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)
raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
f.write(raw)
@ -363,11 +362,11 @@ class Convert(object):
return os.path.join(self.dest_dir, 'metadata.opf')
def read_block_anchors(self, doc):
doc_anchors = frozenset(XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
if doc_anchors:
current_bm = set()
rmap = {v:k for k, v in self.object_map.iteritems()}
for p in descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
if p.tag.endswith('}p'):
if current_bm and p in rmap:
para = rmap[p]
@ -377,7 +376,7 @@ class Convert(object):
self.anchor_map[name] = para.get('id')
current_bm = set()
elif p in doc_anchors:
anchor = get(p, 'w:name')
anchor = self.namespace.get(p, 'w:name')
if anchor:
current_bm.add(anchor)
@ -390,7 +389,7 @@ class Convert(object):
current_anchor = None
current_hyperlink = None
hl_xpath = XPath('ancestor::w:hyperlink[1]')
hl_xpath = self.namespace.XPath('ancestor::w:hyperlink[1]')
def p_parent(x):
# Ensure that nested <w:p> tags are handled. These can occur if a
@ -403,7 +402,7 @@ class Convert(object):
except AttributeError:
break
for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
if p_parent(x) is not p:
continue
if x.tag.endswith('}r'):
@ -422,7 +421,7 @@ class Convert(object):
dest.append(span)
self.layers[p].append(x)
elif x.tag.endswith('}bookmarkStart'):
anchor = get(x, 'w:name')
anchor = self.namespace.get(x, 'w:name')
if anchor and anchor not in self.anchor_map:
old_anchor = current_anchor
self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.itervalues()))
@ -502,17 +501,17 @@ class Convert(object):
span = self.wrap_elems(spans, SPAN())
span.tag = 'a'
self.resolved_link_map[hyperlink] = span
tgt = get(hyperlink, 'w:tgtFrame')
tgt = self.namespace.get(hyperlink, 'w:tgtFrame')
if tgt:
span.set('target', tgt)
tt = get(hyperlink, 'w:tooltip')
tt = self.namespace.get(hyperlink, 'w:tooltip')
if tt:
span.set('title', tt)
rid = get(hyperlink, 'r:id')
rid = self.namespace.get(hyperlink, 'r:id')
if rid and rid in relationships_by_id:
span.set('href', relationships_by_id[rid])
continue
anchor = get(hyperlink, 'w:anchor')
anchor = self.namespace.get(hyperlink, 'w:anchor')
if anchor and anchor in self.anchor_map:
span.set('href', '#' + self.anchor_map[anchor])
continue
@ -576,7 +575,7 @@ class Convert(object):
text = Text(ans, 'text', [])
for child in run:
if is_tag(child, 'w:t'):
if self.namespace.is_tag(child, 'w:t'):
if not child.text:
continue
space = child.get(XML('space'), None)
@ -596,11 +595,11 @@ class Convert(object):
ans.append(text.elem)
else:
text.buf.append(ctext)
elif is_tag(child, 'w:cr'):
elif self.namespace.is_tag(child, 'w:cr'):
text.add_elem(BR())
ans.append(text.elem)
elif is_tag(child, 'w:br'):
typ = get(child, 'w:type')
elif self.namespace.is_tag(child, 'w:br'):
typ = self.namespace.get(child, 'w:type')
if typ in {'column', 'page'}:
br = BR(style='page-break-after:always')
else:
@ -611,25 +610,25 @@ class Convert(object):
br = BR()
text.add_elem(br)
ans.append(text.elem)
elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
elif self.namespace.is_tag(child, 'w:drawing') or self.namespace.is_tag(child, 'w:pict'):
for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
text.add_elem(img)
ans.append(text.elem)
elif is_tag(child, 'w:footnoteReference') or is_tag(child, 'w:endnoteReference'):
elif self.namespace.is_tag(child, 'w:footnoteReference') or self.namespace.is_tag(child, 'w:endnoteReference'):
anchor, name = self.footnotes.get_ref(child)
if anchor and name:
l = SUP(A(name, href='#' + anchor, title=name), id='back_%s' % anchor)
l.set('class', 'noteref')
text.add_elem(l)
ans.append(text.elem)
elif is_tag(child, 'w:tab'):
elif self.namespace.is_tag(child, 'w:tab'):
spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
text.add_elem(SPAN(NBSP * spaces))
ans.append(text.elem)
ans[-1].set('class', 'tab')
elif is_tag(child, 'w:noBreakHyphen'):
elif self.namespace.is_tag(child, 'w:noBreakHyphen'):
text.buf.append(u'\u2011')
elif is_tag(child, 'w:softHyphen'):
elif self.namespace.is_tag(child, 'w:softHyphen'):
text.buf.append(u'\u00ad')
if text.buf:
setattr(text.elem, text.attr, ''.join(text.buf))

View File

@ -10,7 +10,6 @@ from collections import namedtuple
from lxml.etree import tostring
from calibre.ebooks.docx.names import XPath, descendants, get, ancestor
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.oeb.polish.toc import elem_to_toc_text
@ -21,8 +20,9 @@ class Count(object):
def __init__(self):
self.val = 0
def from_headings(body, log):
def from_headings(body, log, namespace):
' Create a TOC from headings in the document '
XPath, descendants = namespace.XPath, namespace.descendants
headings = ('h1', 'h2', 'h3')
tocroot = TOC()
xpaths = [XPath('//%s' % x) for x in headings]
@ -99,7 +99,8 @@ def link_to_txt(a, styles, object_map):
return tostring(a, method='text', with_tail=False, encoding=unicode).strip()
def from_toc(docx, link_map, styles, object_map, log):
def from_toc(docx, link_map, styles, object_map, log, namespace):
XPath, get, ancestor = namespace.XPath, namespace.get, namespace.ancestor
toc_level = None
level = 0
TI = namedtuple('TI', 'text anchor indent')
@ -136,7 +137,5 @@ def from_toc(docx, link_map, styles, object_map, log):
log('Found Word Table of Contents, using it to generate the Table of Contents')
return structure_toc(toc)
def create_toc(docx, body, link_map, styles, object_map, log, namespace):
    '''
    Build the Table of Contents for a document.

    from_toc() is tried first; when it returns a falsy value the TOC is
    built from the headings in body with from_headings() instead.

    namespace is the namespace profile object. It is forwarded unchanged to
    both helpers, which take their XPath/get/ancestor/descendants helpers
    from it.
    '''
    toc = from_toc(docx, link_map, styles, object_map, log, namespace)
    if toc:
        return toc
    return from_headings(body, log, namespace)

View File

@ -13,7 +13,7 @@ from lxml.builder import ElementMaker
from calibre import guess_type
from calibre.constants import numeric_version, __appname__
from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS, IMAGES, FONTS
from calibre.ebooks.docx.names import DOCXNamespace
from calibre.ebooks.metadata import authors_to_string
from calibre.utils.date import utcnow
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
@ -27,7 +27,8 @@ def xml2str(root, pretty_print=False, with_tail=False):
pretty_print=pretty_print, with_tail=with_tail)
return ans
def create_skeleton(opts):
def create_skeleton(opts, namespaces=None):
namespaces = namespaces or DOCXNamespace().namespaces
def w(x):
return '{%s}%s' % (namespaces['w'], x)
dn = {k:v for k, v in namespaces.iteritems() if k in {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne', 'a', 'pic'}}
@ -70,9 +71,9 @@ def create_skeleton(opts):
return doc, styles, body
def update_doc_props(root, mi):
def update_doc_props(root, mi, namespace):
def setm(name, text=None, ns='dc'):
ans = root.makeelement('{%s}%s' % (namespaces[ns], name))
ans = root.makeelement('{%s}%s' % (namespace.namespaces[ns], name))
for child in tuple(root):
if child.tag == ans.tag:
root.remove(child)
@ -92,12 +93,13 @@ def update_doc_props(root, mi):
class DocumentRelationships(object):
def __init__(self):
def __init__(self, namespace):
self.rmap = {}
self.namespace = namespace
for typ, target in {
STYLES: 'styles.xml',
WEB_SETTINGS: 'webSettings.xml',
FONTS: 'fontTable.xml',
namespace.names['STYLES']: 'styles.xml',
namespace.names['WEB_SETTINGS']: 'webSettings.xml',
namespace.names['FONTS']: 'fontTable.xml',
}.iteritems():
self.add_relationship(target, typ)
@ -112,9 +114,10 @@ class DocumentRelationships(object):
return ans
def add_image(self, target):
    '''
    Register target as an image relationship.

    The relationship type is the ``IMAGES`` entry of the namespace
    profile's ``names`` mapping. Returns whatever add_relationship()
    returns for the new entry.
    '''
    image_type = self.namespace.names['IMAGES']
    return self.add_relationship(target, image_type)
def serialize(self):
namespaces = self.namespace.namespaces
E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
relationships = E.Relationships()
for (target, rtype, target_mode), rid in self.rmap.iteritems():
@ -127,8 +130,10 @@ class DocumentRelationships(object):
class DOCX(object):
def __init__(self, opts, log):
self.namespace = DOCXNamespace()
namespaces = self.namespace.namespaces
self.opts, self.log = opts, log
self.document_relationships = DocumentRelationships()
self.document_relationships = DocumentRelationships(self.namespace)
self.font_table = etree.Element('{%s}fonts' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
self.embedded_fonts = E.Relationships()
@ -138,7 +143,7 @@ class DOCX(object):
# Boilerplate {{{
@property
def contenttypes(self):
E = ElementMaker(namespace=namespaces['ct'], nsmap={None:namespaces['ct']})
E = ElementMaker(namespace=self.namespace.namespaces['ct'], nsmap={None:self.namespace.namespaces['ct']})
types = E.Types()
for partname, mt in {
"/word/footnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
@ -174,7 +179,7 @@ class DOCX(object):
@property
def appproperties(self):
E = ElementMaker(namespace=namespaces['ep'], nsmap={None:namespaces['ep']})
E = ElementMaker(namespace=self.namespace.namespaces['ep'], nsmap={None:self.namespace.namespaces['ep']})
props = E.Properties(
E.Application(__appname__),
E.AppVersion('%02d.%04d' % numeric_version[:2]),
@ -193,14 +198,14 @@ class DOCX(object):
return textwrap.dedent(b'''\
<?xml version='1.0' encoding='utf-8'?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties" Target="docProps/app.xml"/>
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/>
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>''')
<Relationship Id="rId3" Type="{APPPROPS}" Target="docProps/app.xml"/>
<Relationship Id="rId2" Type="{DOCPROPS}" Target="docProps/core.xml"/>
<Relationship Id="rId1" Type="{DOCUMENT}" Target="word/document.xml"/>
</Relationships>'''.format(**self.namespace.names))
@property
def websettings(self):
    '''
    The webSettings part, serialized with xml2str().

    The element is created in the ``w`` namespace of this document's
    namespace profile and contains the optimizeForBrowser, allowPNG and
    doNotSaveAsSingleFile children.
    '''
    w_ns = self.namespace.namespaces['w']
    E = ElementMaker(namespace=w_ns, nsmap={'w': w_ns})
    ws = E.webSettings(
        E.optimizeForBrowser, E.allowPNG, E.doNotSaveAsSingleFile)
    return xml2str(ws)
@ -208,6 +213,7 @@ class DOCX(object):
# }}}
def convert_metadata(self, mi):
namespaces = self.namespace.namespaces
E = ElementMaker(namespace=namespaces['cp'], nsmap={x:namespaces[x] for x in 'cp dc dcterms xsi'.split()})
cp = E.coreProperties(E.revision("1"), E.lastModifiedBy('calibre'))
ts = utcnow().isoformat(str('T')).rpartition('.')[0] + 'Z'
@ -216,7 +222,7 @@ class DOCX(object):
x.text = ts
cp.append(x)
self.mi = mi
update_doc_props(cp, self.mi)
update_doc_props(cp, self.mi, self.namespace)
return xml2str(cp)
def create_empty_document(self, mi):

View File

@ -9,7 +9,6 @@ __copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import defaultdict
from uuid import uuid4
from calibre.ebooks.docx.names import makeelement, EMBEDDED_FONT
from calibre.ebooks.oeb.base import OEB_STYLES
from calibre.ebooks.oeb.transforms.subset import find_font_face_rules
@ -21,10 +20,12 @@ def obfuscate_font_data(data, key):
class FontsManager(object):
def __init__(self, oeb, opts):
def __init__(self, namespace, oeb, opts):
self.namespace = namespace
self.oeb, self.log, self.opts = oeb, oeb.log, opts
def serialize(self, text_styles, fonts, embed_relationships, font_data_map):
makeelement = self.namespace.makeelement
font_families, seen = set(), set()
for ts in text_styles:
if ts.font_family:
@ -68,7 +69,7 @@ class FontsManager(object):
if rid is None:
rel_map[item] = rid = 'rId%d' % num
fname = 'fonts/font%d.odttf' % num
makeelement(embed_relationships, 'Relationship', Id=rid, Type=EMBEDDED_FONT, Target=fname)
makeelement(embed_relationships, 'Relationship', Id=rid, Type=self.namespace.names['EMBEDDED_FONT'], Target=fname)
font_data_map['word/' + fname] = obfuscate_font_data(item.data, key)
makeelement(font, 'w:embed' + tag, r_id=rid,
w_fontKey='{%s}' % key.urn.rpartition(':')[-1].upper(),

View File

@ -9,7 +9,7 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from calibre.ebooks.docx.writer.container import create_skeleton
from calibre.ebooks.docx.writer.styles import w, StylesManager
from calibre.ebooks.docx.writer.styles import StylesManager
from calibre.ebooks.docx.writer.images import ImagesManager
from calibre.ebooks.docx.writer.fonts import FontsManager
from calibre.ebooks.docx.writer.tables import Table
@ -45,12 +45,13 @@ class TextRun(object):
ws_pat = None
def __init__(self, style, first_html_parent):
def __init__(self, namespace, style, first_html_parent):
self.first_html_parent = first_html_parent
if self.ws_pat is None:
TextRun.ws_pat = self.ws_pat = re.compile(r'\s+')
self.style = style
self.texts = []
self.makelement = namespace.makeelement
def add_text(self, text, preserve_whitespace):
if not preserve_whitespace:
@ -68,19 +69,18 @@ class TextRun(object):
self.texts.append((drawing, None))
def serialize(self, p):
r = p.makeelement(w('r'))
p.append(r)
rpr = r.makeelement(w('rPr'))
rpr.append(rpr.makeelement(w('rStyle'), **{w('val'):self.style.id}))
r.append(rpr)
makeelement = self.makelement
r = makeelement(p, 'w:r')
rpr = makeelement(r, 'w:rPr')
makeelement(rpr, 'w:rStyle', w_val=self.style.id)
for text, preserve_whitespace in self.texts:
if text is None:
r.append(r.makeelement(w('br'), **{w('clear'):preserve_whitespace}))
makeelement(r, 'w:br', w_clear=preserve_whitespace)
elif hasattr(text, 'xpath'):
r.append(text)
else:
t = r.makeelement(w('t'))
r.append(t)
t = makeelement(r, 'w:t')
t.text = text or ''
if preserve_whitespace:
t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
@ -94,7 +94,8 @@ class TextRun(object):
class Block(object):
def __init__(self, styles_manager, html_block, style, is_table_cell=False):
def __init__(self, namespace, styles_manager, html_block, style, is_table_cell=False):
self.namespace = namespace
self.html_block = html_block
self.html_style = style
self.style = styles_manager.create_block_style(style, html_block, is_table_cell=is_table_cell)
@ -109,7 +110,7 @@ class Block(object):
if self.runs and ts == self.runs[-1].style:
run = self.runs[-1]
else:
run = TextRun(ts, self.html_block if html_parent is None else html_parent)
run = TextRun(self.namespace, ts, self.html_block if html_parent is None else html_parent)
self.runs.append(run)
preserve_whitespace = ws in {'pre', 'pre-wrap'}
if ignore_leading_whitespace and not preserve_whitespace:
@ -125,7 +126,7 @@ class Block(object):
if self.runs:
run = self.runs[-1]
else:
run = TextRun(self.styles_manager.create_text_style(self.html_style), self.html_block)
run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block)
self.runs.append(run)
run.add_break(clear=clear)
@ -133,20 +134,19 @@ class Block(object):
if self.runs:
run = self.runs[-1]
else:
run = TextRun(self.styles_manager.create_text_style(self.html_style), self.html_block)
run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block)
self.runs.append(run)
run.add_image(drawing)
def serialize(self, body):
p = body.makeelement(w('p'))
body.append(p)
ppr = p.makeelement(w('pPr'))
p.append(ppr)
makeelement = self.namespace.makeelement
p = makeelement(body, 'w:p')
ppr = makeelement(p, 'w:pPr')
if self.keep_next:
ppr.append(ppr.makeelement(w('keepNext')))
makeelement(ppr, 'w:keepNext')
if self.page_break_before:
ppr.append(ppr.makeelement(w('pageBreakBefore')))
ppr.append(ppr.makeelement(w('pStyle'), **{w('val'):self.style.id}))
makeelement(ppr, 'w:pageBreakBefore')
makeelement(ppr, 'w:pStyle', w_val=self.style.id)
for run in self.runs:
run.serialize(p)
@ -158,7 +158,8 @@ class Block(object):
class Blocks(object):
def __init__(self, styles_manager):
def __init__(self, namespace, styles_manager):
self.namespace = namespace
self.styles_manager = styles_manager
self.all_blocks = []
self.pos = 0
@ -183,12 +184,12 @@ class Blocks(object):
def start_new_block(self, html_block, style, is_table_cell=False):
    '''
    Finish the current block and open a new one for html_block.

    The new Block is built with this object's namespace profile and styles
    manager, html_block is recorded in open_html_blocks, and the new block
    is returned (it is also available as self.current_block).
    '''
    self.end_current_block()
    block = Block(
        self.namespace, self.styles_manager, html_block, style,
        is_table_cell=is_table_cell)
    self.current_block = block
    self.open_html_blocks.add(html_block)
    return block
def start_new_table(self, html_tag, tag_style=None):
    '''
    Create a Table for html_tag, make it the current table and add it to
    self.tables. The table is given this object's namespace profile.
    '''
    table = Table(self.namespace, html_tag, tag_style)
    self.current_table = table
    self.tables.append(table)
def start_new_row(self, html_tag, tag_style):
@ -252,10 +253,10 @@ class Convert(object):
self.svg_rasterizer = SVGRasterizer()
self.svg_rasterizer(self.oeb, self.opts)
self.styles_manager = StylesManager()
self.styles_manager = StylesManager(self.docx.namespace)
self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
self.fonts_manager = FontsManager(self.oeb, self.opts)
self.blocks = Blocks(self.styles_manager)
self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts)
self.blocks = Blocks(self.docx.namespace, self.styles_manager)
for item in self.oeb.spine:
self.process_item(item)

View File

@ -15,7 +15,6 @@ from future_builtins import map
from lxml import etree
from calibre.ebooks.oeb.base import urlunquote
from calibre.ebooks.docx.names import makeelement, namespaces
from calibre.ebooks.docx.images import pt_to_emu
from calibre.utils.filenames import ascii_filename
from calibre.utils.magick.draw import identify_data
@ -68,6 +67,8 @@ class ImagesManager(object):
name = urlunquote(posixpath.basename(href))
width, height = map(pt_to_emu, style.img_size(img.width, img.height))
makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
root = etree.Element('root', nsmap=namespaces)
ans = makeelement(root, 'w:drawing', append=False)
if floating is None:

View File

@ -12,7 +12,6 @@ from operator import attrgetter
from lxml import etree
from calibre.ebooks import parse_css_length
from calibre.ebooks.docx.names import namespaces
from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero
from calibre.utils.icu import numeric_sort_key
from tinycss.css21 import CSS21Parser
@ -38,12 +37,6 @@ def css_font_family_to_docx(raw):
for ff in parse_css_font_family(raw):
return generic.get(ff.lower(), ff)
def w(x):
return '{%s}%s' % (namespaces['w'], x)
def makeelement(parent, name, **attrs):
return parent.makeelement(w(name), **{w(k):v for k, v in attrs.iteritems()})
def bmap(x):
    ' Map the truthiness of x to the string "on" or "off" '
    if x:
        return 'on'
    return 'off'
@ -52,12 +45,17 @@ class DOCXStyle(object):
ALL_PROPS = ()
TYPE = 'paragraph'
def __init__(self):
def __init__(self, namespace):
self.namespace = namespace
self.w = lambda x: '{%s}%s' % (namespace.namespaces['w'], x)
self._hash = hash(tuple(
getattr(self, x) for x in self.ALL_PROPS))
self.id = self.name = None
self.next_style = None
def makeelement(self, parent, name, **attrs):
    '''
    Return parent.makeelement() for the tag name, with the tag and every
    attribute name in attrs qualified by self.w().

    Nothing is appended to parent here; callers that want the element in
    the tree append the returned value themselves.
    '''
    qualify = self.w
    # items() instead of iteritems(): same result for a keyword dict and it
    # does not depend on the Python 2 only dict API.
    qualified = {qualify(key): value for key, value in attrs.items()}
    return parent.makeelement(qualify(name), **qualified)
def __hash__(self):
return self._hash
@ -71,10 +69,11 @@ class DOCXStyle(object):
return not self == other
def __repr__(self):
    '''
    Return the pretty printed XML that serialize() produces for this style
    inside a throwaway root element named after the class.
    '''
    root = etree.Element(
        self.__class__.__name__, nsmap={'w': self.namespace.namespaces['w']})
    # serialize() requires a normal_style argument; calling it with the
    # root alone raises TypeError.
    # NOTE(review): passing self as the reference style is an assumption
    # made so that repr() works without a styles manager -- confirm.
    return etree.tostring(self.serialize(root, self), pretty_print=True)
__str__ = __repr__
def serialize(self, styles, normal_style):
w, makeelement = self.w, self.makeelement
style = makeelement(styles, 'style', styleId=self.id, type=self.TYPE)
style.append(makeelement(style, 'name', val=self.name))
if self is normal_style:
@ -106,7 +105,7 @@ class TextStyle(DOCXStyle):
'border_style', 'border_width', 'border_color')
TYPE = 'character'
def __init__(self, css, is_parent_style=False):
def __init__(self, namespace, css, is_parent_style=False):
self.font_family = css_font_family_to_docx(css['font-family'])
try:
self.font_size = max(0, int(float(css['font-size']) * 2)) # stylizer normalizes all font sizes into pts
@ -163,9 +162,10 @@ class TextStyle(DOCXStyle):
elif self.border_style != style:
self.border_style = ignore
DOCXStyle.__init__(self)
DOCXStyle.__init__(self, namespace)
def serialize_borders(self, bdr, normal_style):
w = self.w
if (self.padding not in (None, ignore, 0) and self is normal_style) or self.padding != normal_style.padding:
bdr.set(w('space'), str(0 if self.padding in (None, ignore) else self.padding))
if (self.border_width not in (None, ignore, 0) and self is normal_style) or self.border_width != normal_style.border_width:
@ -177,6 +177,7 @@ class TextStyle(DOCXStyle):
return bdr
def serialize(self, styles, normal_style):
makeelement = self.makeelement
style_root = DOCXStyle.serialize(self, styles, normal_style)
style = makeelement(style_root, 'rPr')
@ -273,7 +274,7 @@ class BlockStyle(DOCXStyle):
[x%edge for edge in border_edges for x in border_props]
)
def __init__(self, css, html_block, is_table_cell=False):
def __init__(self, namespace, css, html_block, is_table_cell=False):
read_css_block_borders(self, css)
if is_table_cell:
for edge in border_edges:
@ -298,9 +299,10 @@ class BlockStyle(DOCXStyle):
self.text_align = {'start':'left', 'left':'left', 'end':'right', 'right':'right', 'center':'center', 'justify':'both', 'centre':'center'}.get(
css['text-align'].lower(), 'left')
DOCXStyle.__init__(self)
DOCXStyle.__init__(self, namespace)
def serialize_borders(self, bdr, normal_style):
w = self.w
for edge in border_edges:
e = bdr.makeelement(w(edge))
padding = getattr(self, 'padding_' + edge)
@ -319,6 +321,7 @@ class BlockStyle(DOCXStyle):
return bdr
def serialize(self, styles, normal_style):
w, makeelement = self.w, self.makeelement
style_root = DOCXStyle.serialize(self, styles, normal_style)
style = makeelement(style_root, 'pPr')
@ -393,11 +396,12 @@ class BlockStyle(DOCXStyle):
class StylesManager(object):
def __init__(self):
def __init__(self, namespace):
self.namespace = namespace
self.block_styles, self.text_styles = {}, {}
def create_text_style(self, css_style, is_parent_style=False):
ans = TextStyle(css_style, is_parent_style=is_parent_style)
ans = TextStyle(self.namespace, css_style, is_parent_style=is_parent_style)
existing = self.text_styles.get(ans, None)
if existing is None:
self.text_styles[ans] = ans
@ -406,7 +410,7 @@ class StylesManager(object):
return ans
def create_block_style(self, css_style, html_block, is_table_cell=False):
ans = BlockStyle(css_style, html_block, is_table_cell=is_table_cell)
ans = BlockStyle(self.namespace, css_style, html_block, is_table_cell=is_table_cell)
existing = self.block_styles.get(ans, None)
if existing is None:
self.block_styles[ans] = ans

View File

@ -8,7 +8,6 @@ __copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import namedtuple
from calibre.ebooks.docx.names import makeelement
from calibre.ebooks.docx.writer.utils import convert_color
from calibre.ebooks.docx.writer.styles import read_css_block_borders as rcbb, border_edges
@ -29,7 +28,7 @@ class SpannedCell(object):
def resolve_borders(self):
    ' Intentionally does nothing '
    return None
def serialize(self, tr):
def serialize(self, tr, makeelement):
tc = makeelement(tr, 'w:tc')
tcPr = makeelement(tc, 'w:tcPr')
makeelement(tcPr, 'w:%sMerge' % ('h' if self.horizontal else 'v'), w_val='continue')
@ -70,14 +69,6 @@ def convert_width(tag_style):
pass
return ('auto', 0)
def serialize_border_edge(self, bdr, edge):
width = getattr(self, 'border_%s_width' % edge)
bstyle = getattr(self, 'border_%s_style' % edge)
if width > 0 and bstyle != 'none':
makeelement(bdr, 'w:' + edge, w_val=bstyle, w_sz=str(width), w_color=getattr(self, 'border_%s_color' % edge))
return True
return False
class Cell(object):
BLEVEL = 2
@ -107,7 +98,7 @@ class Cell(object):
self.items.append(table)
return table
def serialize(self, parent):
def serialize(self, parent, makeelement):
tc = makeelement(parent, 'w:tc')
tcPr = makeelement(tc, 'w:tcPr')
makeelement(tcPr, 'w:tcW', w_type=self.width[0], w_w=str(self.width[1]))
@ -240,16 +231,17 @@ class Row(object):
def add_table(self, table):
    ' Add table to the current cell and return what the cell returns '
    cell = self.current_cell
    return cell.add_table(table)
def serialize(self, parent, makeelement):
    '''
    Create a w:tr element under parent and serialize every cell into it.

    makeelement is the element factory to use; it is called as
    makeelement(parent, 'w:tr') and handed on to each cell's serialize().
    '''
    tr = makeelement(parent, 'w:tr')
    for cell in self.cells:
        # Each cell is serialized exactly once, with the same factory.
        cell.serialize(tr, makeelement)
class Table(object):
BLEVEL = 0
def __init__(self, html_tag, tag_style=None):
def __init__(self, namespace, html_tag, tag_style=None):
self.namespace = namespace
self.html_tag = html_tag
self.rows = []
self.current_row = None
@ -329,6 +321,7 @@ class Table(object):
return self.current_row.add_table(table)
def serialize(self, parent):
makeelement = self.namespace.makeelement
rows = [r for r in self.rows if r.cells]
if not rows:
return
@ -338,4 +331,4 @@ class Table(object):
if self.jc is not None:
makeelement(tblPr, 'w:jc', w_val=self.jc)
for row in rows:
row.serialize(tbl)
row.serialize(tbl, makeelement)

View File

@ -12,14 +12,14 @@ from io import BytesIO
from lxml import etree
from calibre.ebooks.docx.container import DOCX
from calibre.ebooks.docx.writer.container import update_doc_props, xml2str, namespaces
from calibre.ebooks.docx.names import XPath, get
from calibre.ebooks.docx.writer.container import update_doc_props, xml2str
from calibre.utils.magick.draw import identify_data
images = XPath('//*[name()="w:drawing" or name()="w:pict"]/descendant::*[(name()="a:blip" and @r:embed) or (name()="v:imagedata" and @r:id)][1]')
def get_cover(docx):
doc = docx.document
get = docx.namespace.get
images = docx.namespace.XPath(
'//*[name()="w:drawing" or name()="w:pict"]/descendant::*[(name()="a:blip" and @r:embed) or (name()="v:imagedata" and @r:id)][1]')
rid_map = docx.document_relationships[0]
for image in images(doc):
rid = get(image, 'r:embed') or get(image, 'r:id')
@ -58,11 +58,11 @@ def set_metadata(stream, mi):
except Exception:
ap_raw = None
cp = etree.fromstring(dp_raw)
update_doc_props(cp, mi)
update_doc_props(cp, mi, c.namespace)
replacements = {}
if ap_raw is not None:
ap = etree.fromstring(ap_raw)
comp = ap.makeelement('{%s}Company' % namespaces['ep'])
comp = ap.makeelement('{%s}Company' % c.namespace.namespaces['ep'])
for child in tuple(ap):
if child.tag == comp.tag:
ap.remove(child)