From ca0899956dff819ffa798f41f44c86ab1ac37692 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 2 Jun 2013 10:39:02 +0530 Subject: [PATCH] DOCX: Nested tables and block/run table styles --- src/calibre/ebooks/docx/styles.py | 9 ++- src/calibre/ebooks/docx/tables.py | 111 +++++++++++++++++++++-------- src/calibre/ebooks/docx/to_html.py | 25 ++----- 3 files changed, 94 insertions(+), 51 deletions(-) diff --git a/src/calibre/ebooks/docx/styles.py b/src/calibre/ebooks/docx/styles.py index f164b194b5..72046ebda3 100644 --- a/src/calibre/ebooks/docx/styles.py +++ b/src/calibre/ebooks/docx/styles.py @@ -118,7 +118,7 @@ class Styles(object): Collection of all styles defined in the document. Used to get the final styles applicable to elements in the document markup. ''' - def __init__(self): + def __init__(self, tables): self.id_map = OrderedDict() self.para_cache = {} self.para_char_cache = {} @@ -126,6 +126,7 @@ class Styles(object): self.classes = {} self.counter = Counter() self.default_styles = {} + self.tables = tables self.numbering_style_links = {} def __iter__(self): @@ -226,6 +227,9 @@ class Styles(object): parent_styles = [] if self.default_paragraph_style is not None: parent_styles.append(self.default_paragraph_style) + ts = self.tables.para_style(p) + if ts is not None: + parent_styles.append(ts) default_para = self.default_styles.get('paragraph', None) if direct_formatting.linked_style is not None: @@ -278,6 +282,9 @@ class Styles(object): default_char = self.default_styles.get('character', None) if self.default_character_style is not None: parent_styles.append(self.default_character_style) + ts = self.tables.run_style(p) + if ts is not None: + parent_styles.append(ts) pstyle = self.para_char_cache.get(p, None) if pstyle is not None: parent_styles.append(pstyle) diff --git a/src/calibre/ebooks/docx/tables.py b/src/calibre/ebooks/docx/tables.py index feb76bfb71..f369afe4a2 100644 --- a/src/calibre/ebooks/docx/tables.py +++ b/src/calibre/ebooks/docx/tables.py @@ -6,8 +6,6 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' -from collections import OrderedDict - from lxml.html.builder import TABLE, TR, TD from calibre.ebooks.docx.block_styles import inherit, read_shd, read_border, binary_property, border_props, ParagraphStyle # noqa @@ -212,41 +210,94 @@ class TableStyle(object): if val is inherit: setattr(self, p, getattr(parent, p)) +class Table(object): + + def __init__(self, tbl, styles, para_map): + self.tbl = tbl + self.styles = styles + + # Read Table Style + style = {'table':TableStyle()} + for tblPr in XPath('./w:tblPr')(tbl): + for ts in XPath('./w:tblStyle[@w:val]')(tblPr): + style_id = get(ts, 'w:val') + s = styles.get(style_id) + if s is not None: + if s.table_style is not None: + style['table'].update(s.table_style) + if s.paragraph_style is not None: + if 'paragraph' in style: + style['paragraph'].update(s.paragraph_style) + else: + style['paragraph'] = s.paragraph_style + if s.character_style is not None: + if 'run' in style: + style['run'].update(s.character_style) + else: + style['run'] = s.character_style + style['table'].update(TableStyle(tblPr)) + self.table_style, self.paragraph_style = style['table'], style.get('paragraph', None) + self.run_style = style.get('run', None) + self.paragraphs = XPath('./w:tr/w:tc/w:p')(tbl) + + self.sub_tables = {x:Table(x, styles, para_map) for x in XPath('./w:tr/w:tc/w:tbl')(tbl)} + para_map.update({p:self for p in self.paragraphs}) + + def __iter__(self): + for p in self.paragraphs: + yield p + for t in self.sub_tables.itervalues(): + for p in t: + yield p + + def apply_markup(self, rmap, parent=None): + table = TABLE('\n\t\t') + if parent is None: + try: + first_para = rmap[next(iter(self))] + except StopIteration: + return + parent = first_para.getparent() + idx = parent.index(first_para) + parent.insert(idx, table) + else: + parent.append(table) + for row in XPath('./w:tr')(self.tbl): + tr = TR('\n\t\t\t') + tr.tail = '\n\t\t' + table.append(tr) + for tc in XPath('./w:tc')(row): + td = TD() + td.tail = '\n\t\t\t' + tr.append(td) + for x in XPath('./w:p|./w:tbl')(tc): + if x.tag.endswith('}p'): + td.append(rmap[x]) + else: + self.sub_tables[x].apply_markup(rmap, parent=td) + if len(tr): + tr[-1].tail = '\n\t\t' + if len(table): + table[-1].tail = '\n\t' + class Tables(object): def __init__(self): - self.tables = OrderedDict() + self.tables = [] + self.para_map = {} - def register(self, tbl): - self.tables[tbl] = self.current_table = [] - - def add(self, p): - self.current_table.append(p) + def register(self, tbl, styles): + self.tables.append(Table(tbl, styles, self.para_map)) def apply_markup(self, object_map): rmap = {v:k for k, v in object_map.iteritems()} - for tbl, blocks in self.tables.iteritems(): - if not blocks: - continue - parent = rmap[blocks[0]].getparent() - table = TABLE('\n\t\t') - idx = parent.index(rmap[blocks[0]]) - parent.insert(idx, table) - for row in XPath('./w:tr')(tbl): - tr = TR('\n\t\t\t') - tr.tail = '\n\t\t' - table.append(tr) - for tc in XPath('./w:tc')(row): - td = TD() - td.tail = '\n\t\t\t' - tr.append(td) - for p in XPath('./w:p')(tc): - block = rmap[p] - td.append(block) - if len(tr): - tr[-1].tail = '\n\t\t' - if len(table): - table[-1].tail = '\n\t' + for table in self.tables: + table.apply_markup(rmap) + def para_style(self, p): + return getattr(self.para_map.get(p, None), 'paragraph_style', None) + + def run_style(self, p): + return getattr(self.para_map.get(p, None), 'run_style', None) diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py index 1250b60dd2..2f945e8980 100644 --- a/src/calibre/ebooks/docx/to_html.py +++ b/src/calibre/ebooks/docx/to_html.py @@ -46,9 +46,9 @@ class Convert(object): self.dest_dir = dest_dir or os.getcwdu() self.mi = self.docx.metadata self.body = BODY() - self.styles = Styles() - self.images = Images() self.tables = Tables() + self.styles = Styles(self.tables) + self.images = Images() self.object_map = OrderedDict() self.html = HTML( HEAD( @@ -100,17 +100,9 @@ class Convert(object): dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text), id=anchor)) dl[-1][0].tail = ']' dl.append(DD()) - in_table = False for wp in note: if wp.tag.endswith('}tbl'): - self.tables.register(wp) - in_table = True - continue - if in_table: - if ancestor(wp, 'w:tbl') is not None: - self.tables.add(wp) - else: - in_table = False + self.tables.register(wp, self.styles) p = self.convert_p(wp) dl[-1].append(p) @@ -167,12 +159,9 @@ class Convert(object): current = [] self.page_map = OrderedDict() - in_table = False - for p in descendants(doc, 'w:p', 'w:tbl'): if p.tag.endswith('}tbl'): - in_table = True - self.tables.register(p) + self.tables.register(p, self.styles) continue sect = tuple(descendants(p, 'w:sectPr')) if sect: @@ -182,11 +171,7 @@ class Convert(object): current = [] else: current.append(p) - if in_table: - if ancestor(p, 'w:tbl') is not None: - self.tables.add(p) - else: - in_table = False + if current: last = XPath('./w:body/w:sectPr')(doc) pr = PageProperties(last)