DOCX Input: Start work on tables

This commit is contained in:
Kovid Goyal 2013-05-28 09:45:24 +05:30
parent 5887243d56
commit 6eb97d2626
5 changed files with 228 additions and 34 deletions

View File

@ -65,35 +65,41 @@ LINE_STYLES = { # {{{
} # }}}
# Read from XML {{{
def read_border(parent, dest):
tvals = {'padding_%s':inherit, 'border_%s_width':inherit,
'border_%s_style':inherit, 'border_%s_color':inherit}
vals = {}
for edge in ('left', 'top', 'right', 'bottom'):
vals.update({k % edge:v for k, v in tvals.iteritems()})
for border in XPath('./w:pBdr')(parent):
for edge in ('left', 'top', 'right', 'bottom'):
for elem in XPath('./w:%s' % edge)(border):
color = get(elem, 'w:color')
if color is not None:
vals['border_%s_color' % edge] = simple_color(color)
style = get(elem, 'w:val')
if style is not None:
vals['border_%s_style' % edge] = LINE_STYLES.get(style, 'solid')
space = get(elem, 'w:space')
if space is not None:
try:
vals['padding_%s' % edge] = float(space)
except (ValueError, TypeError):
pass
sz = get(elem, 'w:sz')
if sz is not None:
# we don't care about art borders (they are only used for page borders)
try:
vals['border_%s_width' % edge] = min(96, max(2, float(sz))) / 8
except (ValueError, TypeError):
pass
# Attribute-name templates for the per-edge border properties. Each template
# is completed with an edge name via ``template % edge`` (for example
# 'border_%s_width' % 'left' -> 'border_left_width').
border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color')
def read_single_border(parent, edge):
    """Read the border definition of a single edge below ``parent``.

    Every ``w:<edge>`` child of ``parent`` is visited in the order returned
    by the XPath query. An attribute present on a later element replaces the
    value taken from an earlier one; attributes that are missing (or whose
    numeric value cannot be parsed) leave the previous value untouched.

    :param parent: element whose ``w:<edge>`` children are inspected
    :param edge: edge name used to build the child element name
    :return: dict keyed by the templates in ``border_props`` with the padding,
        width, style and color that were found (``None`` when unspecified)
    """
    padding = width = style = color = None
    for border_elem in XPath('./w:%s' % edge)(parent):
        raw_color = get(border_elem, 'w:color')
        if raw_color is not None:
            color = simple_color(raw_color)

        raw_style = get(border_elem, 'w:val')
        if raw_style is not None:
            # Unknown line styles fall back to a plain solid line
            style = LINE_STYLES.get(raw_style, 'solid')

        raw_space = get(border_elem, 'w:space')
        if raw_space is not None:
            try:
                padding = float(raw_space)
            except (ValueError, TypeError):
                pass

        raw_size = get(border_elem, 'w:sz')
        if raw_size is not None:
            # We don't care about art borders (they are only used for page borders)
            try:
                size = float(raw_size)
            except (ValueError, TypeError):
                pass
            else:
                # Clamp the raw size to the range [2, 96] before scaling by 1/8
                width = min(96, max(2, size)) / 8
    return dict(zip(border_props, (padding, width, style, color)))
def read_border(parent, dest, border_edges=('left', 'top', 'right', 'bottom'), name='pBdr'):
    """Copy border settings found below ``parent`` onto ``dest``.

    For every edge in ``border_edges`` and every template in ``border_props``
    an attribute named ``template % edge`` is set on ``dest``. Each attribute
    starts out as ``inherit`` and is replaced by whatever value the
    ``w:<name>`` children of ``parent`` specify for that edge.

    :param parent: element searched for ``w:<name>`` border containers
    :param dest: object that receives the resulting attributes
    :param border_edges: edge names to read
    :param name: local name of the border container element
    """
    resolved = {}
    for edge in border_edges:
        for template in border_props:
            resolved[template % edge] = inherit

    for container in XPath('./w:' + name)(parent):
        for edge in border_edges:
            found = read_single_border(container, edge)
            for template, value in found.iteritems():
                if value is None:
                    # Not specified for this edge: keep the current value
                    continue
                resolved[template % edge] = value

    for attr, value in resolved.iteritems():
        setattr(dest, attr, value)

View File

@ -17,7 +17,7 @@ class Note(object):
self.parent = parent
def __iter__(self):
for p in descendants(self.parent, 'w:p'):
for p in descendants(self.parent, 'w:p', 'w:tbl'):
yield p
class Footnotes(object):

View File

@ -11,6 +11,7 @@ from collections import OrderedDict, Counter
from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit
from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.tables import TableStyle
from calibre.ebooks.docx.names import XPath, get
class PageProperties(object):
@ -66,10 +67,17 @@ class Style(object):
self.based_on = None
self.is_default = get(elem, 'w:default') in {'1', 'on', 'true'}
self.paragraph_style = self.character_style = None
self.paragraph_style = self.character_style = self.table_style = None
if self.style_type in {'paragraph', 'character'}:
if self.style_type == 'paragraph':
if self.style_type in {'paragraph', 'character', 'table'}:
if self.style_type == 'table':
for tblPr in XPath('./w:tblPr')(elem):
ts = TableStyle(tblPr)
if self.table_style is None:
self.table_style = ts
else:
self.table_style.update(ts)
if self.style_type in {'paragraph', 'table'}:
for pPr in XPath('./w:pPr')(elem):
ps = ParagraphStyle(pPr)
if self.paragraph_style is None:
@ -90,6 +98,10 @@ class Style(object):
self.numbering_style_link = get(x, 'w:val')
def resolve_based_on(self, parent):
if parent.table_style is not None:
if self.table_style is None:
self.table_style = TableStyle()
self.table_style.resolve_based_on(parent.table_style)
if parent.paragraph_style is not None:
if self.paragraph_style is None:
self.paragraph_style = ParagraphStyle()

View File

@ -0,0 +1,152 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict
from lxml.html.builder import TABLE, TR, TD
from calibre.ebooks.docx.block_styles import inherit, read_shd, read_border ,border_props # noqa
from calibre.ebooks.docx.names import XPath, get
def _read_width(elem):
    """Convert a width-style element into a CSS length string.

    The ``w:w`` attribute supplies the magnitude (treated as 0 when it is
    missing or not an integer) and ``w:type`` (requested with a default of
    ``'auto'``) selects the unit:

    * ``nil``  -> ``'0'``
    * ``auto`` -> ``'auto'``
    * ``dxa``  -> magnitude / 20, formatted as ``pt``
    * ``pct``  -> magnitude / 50, formatted as a percentage

    Any other type yields ``inherit``. Division is true division because of
    the module's ``from __future__ import division``.
    """
    try:
        magnitude = int(get(elem, 'w:w'))
    except (TypeError, ValueError):
        magnitude = 0

    kind = get(elem, 'w:type', 'auto')
    if kind == 'nil':
        return '0'
    if kind == 'auto':
        return 'auto'
    if kind == 'dxa':
        return '%.3gpt' % (magnitude/20)
    if kind == 'pct':
        return '%.3g%%' % (magnitude/50)
    return inherit
def read_width(parent, dest):
    """Set ``dest.width`` from the ``w:tblW`` children of ``parent``.

    If several ``w:tblW`` elements are present the last one wins; if there
    are none the width is ``inherit``.
    """
    width = inherit
    for node in XPath('./w:tblW')(parent):
        width = _read_width(node)
    dest.width = width
def read_padding(parent, dest):
    """Set ``dest.cell_padding_<edge>`` for the four edges from ``parent``.

    The margins are read from the ``w:tblCellMar`` child when ``parent`` is a
    ``tblPr`` element and from ``w:tcMar`` otherwise. Every edge that is not
    specified is set to ``inherit``; when an edge is specified more than once
    the last occurrence wins.

    :param parent: properties element (its tag decides which container
        element is searched)
    :param dest: object that receives the ``cell_padding_*`` attributes
    """
    name = 'tblCellMar' if parent.tag.endswith('}tblPr') else 'tcMar'
    edges = ('left', 'top', 'right', 'bottom')
    # Collect the results in a real dict. Writing through ``locals()`` does
    # not work inside a function: the mapping is re-synchronised from the
    # actual local variables on each call, so stored values were lost and
    # every padding silently ended up as ``inherit``.
    padding = {x: inherit for x in edges}
    for mar in XPath('./w:%s' % name)(parent):
        for x in edges:
            for edge in XPath('./w:%s' % x)(mar):
                padding[x] = _read_width(edge)
    for x in edges:
        setattr(dest, 'cell_padding_%s' % x, padding[x])
def read_justification(parent, dest):
    """Translate ``w:jc`` alignment into ``margin_left`` / ``margin_right``.

    Each ``w:jc`` child of ``parent`` carrying a ``w:val`` attribute is
    examined in turn: ``left`` makes the right margin ``'auto'``, ``right``
    makes the left margin ``'auto'`` and ``center`` makes both ``'auto'``.
    Margins not touched by any element stay ``inherit``. The results are
    stored on ``dest``.
    """
    margin_left = margin_right = inherit
    for jc in XPath('./w:jc[@w:val]')(parent):
        alignment = get(jc, 'w:val')
        if not alignment:
            continue
        if alignment in ('right', 'center'):
            margin_left = 'auto'
        if alignment in ('left', 'center'):
            margin_right = 'auto'
    dest.margin_left = margin_left
    dest.margin_right = margin_right
def read_spacing(parent, dest):
    """Set ``dest.spacing`` from the ``w:tblCellSpacing`` children of ``parent``.

    The last matching element wins; without any match the value is
    ``inherit``.
    """
    spacing = inherit
    for node in XPath('./w:tblCellSpacing')(parent):
        spacing = _read_width(node)
    dest.spacing = spacing
def read_indent(parent, dest):
    """Set ``dest.indent`` from the ``w:tblInd`` children of ``parent``.

    The last matching element wins; without any match the value is
    ``inherit``.
    """
    indent = inherit
    for node in XPath('./w:tblInd')(parent):
        indent = _read_width(node)
    dest.indent = indent
# Edge names for which table borders are read (passed to read_border() by
# read_borders() and used to build TableStyle.all_properties). Besides the four
# outer edges this includes 'insideH' and 'insideV' -- presumably the inner
# horizontal/vertical lines between cells; confirm against the DOCX spec.
border_edges = ('left', 'top', 'right', 'bottom', 'insideH', 'insideV')
def read_borders(parent, dest):
    """Read the table/cell borders below ``parent`` onto ``dest``.

    The border container is ``w:tblBorders`` when ``parent`` is a ``tblPr``
    element and ``w:tcBorders`` otherwise; the actual work is delegated to
    ``read_border`` using the module level ``border_edges``.
    """
    if parent.tag.endswith('}tblPr'):
        container = 'tblBorders'
    else:
        container = 'tcBorders'
    read_border(parent, dest, border_edges, container)
class TableStyle(object):
    """Formatting properties of a table.

    An instance created without arguments has every property in
    ``all_properties`` set to ``inherit``. An instance created from a
    properties element is filled in by the module's ``read_*`` functions.
    """

    # Names of all attributes that make up a table style: the fixed ones plus
    # one per (edge, border property) combination.
    all_properties = (
        'width', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
        'cell_padding_bottom', 'margin_left', 'margin_right', 'background_color',
        'spacing', 'indent',
    ) + tuple(k % edge for edge in border_edges for k in border_props)

    def __init__(self, tblPr=None):
        """Create the style, optionally reading it from ``tblPr``.

        :param tblPr: properties element to read from, or ``None`` for a
            style whose properties are all ``inherit``
        """
        if tblPr is None:
            for name in self.all_properties:
                setattr(self, name, inherit)
        else:
            # Same readers, in the same order, as the names
            # width/padding/shd/justification/spacing/indent/borders
            readers = (
                read_width, read_padding, read_shd, read_justification,
                read_spacing, read_indent, read_borders,
            )
            for reader in readers:
                reader(tblPr, self)
        self._css = None

    def update(self, other):
        """Overwrite properties with those ``other`` explicitly specifies.

        Properties that are ``inherit`` on ``other`` are left unchanged.
        """
        for name in self.all_properties:
            candidate = getattr(other, name)
            if candidate is inherit:
                continue
            setattr(self, name, candidate)

    def resolve_based_on(self, parent):
        """Fill every property still set to ``inherit`` from ``parent``."""
        for name in self.all_properties:
            if getattr(self, name) is inherit:
                setattr(self, name, getattr(parent, name))

    @property
    def css(self):
        """The stored CSS value (``None`` as initialised by ``__init__``)."""
        return self._css
class Tables(object):
    """Collects the blocks belonging to each table and builds the markup.

    Tables are remembered in registration order; every block added after a
    ``register()`` call is attributed to that most recently registered table.
    """

    def __init__(self):
        # Maps each registered table element to the list of its blocks
        self.tables = OrderedDict()

    def register(self, tbl):
        """Start collecting blocks for the table element ``tbl``."""
        blocks = []
        self.tables[tbl] = blocks
        self.current_table = blocks

    def add(self, p):
        """Attribute the block ``p`` to the most recently registered table."""
        self.current_table.append(p)

    def apply_markup(self, object_map):
        """Wrap the generated blocks of every table in TABLE/TR/TD markup.

        :param object_map: mapping whose values are the source elements and
            whose keys are the objects generated for them; it is inverted
            here to look up the generated object for a source element

        For each table that has at least one block, a TABLE element is
        inserted at the position of the object generated for its first
        block. A TR is created per ``w:tr`` child and a TD per ``w:tc``
        child, and the objects generated for the ``w:p`` children of each
        cell are moved into the matching TD. The text/tail whitespace set
        below only affects the indentation of the serialized markup.
        """
        generated = {src: made for made, src in object_map.iteritems()}
        for tbl, blocks in self.tables.iteritems():
            if not blocks:
                continue
            first = generated[blocks[0]]
            container = first.getparent()
            table = TABLE('\n\t\t')
            position = container.index(generated[blocks[0]])
            container.insert(position, table)
            for row in XPath('./w:tr')(tbl):
                tr = TR('\n\t\t\t')
                tr.tail = '\n\t\t'
                table.append(tr)
                for cell in XPath('./w:tc')(row):
                    td = TD()
                    td.tail = '\n\t\t\t'
                    tr.append(td)
                    for para in XPath('./w:p')(cell):
                        td.append(generated[para])
                # The last cell of a row is followed by the row's closing tag
                if len(tr):
                    tr[-1].tail = '\n\t\t'
            # The last row is followed by the table's closing tag
            if len(table):
                table[-1].tail = '\n\t'

View File

@ -21,6 +21,7 @@ from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
from calibre.ebooks.docx.numbering import Numbering
from calibre.ebooks.docx.fonts import Fonts
from calibre.ebooks.docx.images import Images
from calibre.ebooks.docx.tables import Tables
from calibre.ebooks.docx.footnotes import Footnotes
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
@ -47,6 +48,7 @@ class Convert(object):
self.body = BODY()
self.styles = Styles()
self.images = Images()
self.tables = Tables()
self.object_map = OrderedDict()
self.html = HTML(
HEAD(
@ -98,15 +100,26 @@ class Convert(object):
dl.append(DT('[', A('' + text, href='#back_%s' % anchor, title=text), id=anchor))
dl[-1][0].tail = ']'
dl.append(DD())
in_table = False
for wp in note:
if wp.tag.endswith('}tbl'):
self.tables.register(wp)
in_table = True
continue
if in_table:
if ancestor(wp, 'w:tbl') is not None:
self.tables.add(wp)
else:
in_table = False
p = self.convert_p(wp)
dl[-1].append(p)
self.resolve_links(relationships_by_id)
# TODO: tables <w:tbl> child of <w:body> (nested tables?)
self.styles.cascade(self.layers)
self.tables.apply_markup(self.object_map)
numbered = []
for html_obj, obj in self.object_map.iteritems():
raw = obj.get('calibre_num_id', None)
@ -154,7 +167,13 @@ class Convert(object):
current = []
self.page_map = OrderedDict()
for p in descendants(doc, 'w:p'):
in_table = False
for p in descendants(doc, 'w:p', 'w:tbl'):
if p.tag.endswith('}tbl'):
in_table = True
self.tables.register(p)
continue
sect = tuple(descendants(p, 'w:sectPr'))
if sect:
pr = PageProperties(sect)
@ -163,6 +182,11 @@ class Convert(object):
current = []
else:
current.append(p)
if in_table:
if ancestor(p, 'w:tbl') is not None:
self.tables.add(p)
else:
in_table = False
if current:
last = XPath('./w:body/w:sectPr')(doc)
pr = PageProperties(last)