DOCX Output: Start work on conversion of tables

This commit is contained in:
Kovid Goyal 2015-04-01 15:14:55 +05:30
parent de5ce8d46b
commit 0118d5d181
3 changed files with 181 additions and 31 deletions

View File

@ -4,7 +4,6 @@ Links
Various TODOs sprinkled through the source Various TODOs sprinkled through the source
Tables Tables
Lists Lists
Embed Fonts
Cover image Cover image
RTL text RTL text
Lang support in run styles <w:lang> Lang support in run styles <w:lang>

View File

@ -12,6 +12,7 @@ from calibre.ebooks.docx.writer.container import create_skeleton
from calibre.ebooks.docx.writer.styles import w, StylesManager from calibre.ebooks.docx.writer.styles import w, StylesManager
from calibre.ebooks.docx.writer.images import ImagesManager from calibre.ebooks.docx.writer.images import ImagesManager
from calibre.ebooks.docx.writer.fonts import FontsManager from calibre.ebooks.docx.writer.fonts import FontsManager
from calibre.ebooks.docx.writer.tables import Table
from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
from calibre.ebooks.oeb.base import XPath, barename from calibre.ebooks.oeb.base import XPath, barename
@ -157,23 +158,74 @@ class Block(object):
class Blocks(object): class Blocks(object):
def __init__(self): def __init__(self, styles_manager):
self.styles_manager = styles_manager
self.all_blocks = [] self.all_blocks = []
self.pos = 0 self.pos = 0
self.current_block = None self.current_block = None
self.items = []
self.tables = []
self.current_table = None
self.open_html_blocks = set()
def start_new_block(self, styles_manager, html_block, style): def current_or_new_block(self, html_tag, tag_style):
return self.current_block or self.start_new_block(html_tag, tag_style)
def end_current_block(self):
if self.current_block is not None: if self.current_block is not None:
self.all_blocks.append(self.current_block) self.all_blocks.append(self.current_block)
self.current_block = Block(styles_manager, html_block, style) if self.current_table is not None:
self.current_table.add_block(self.current_block)
else:
self.block_map[self.current_block] = len(self.items)
self.items.append(self.current_block)
self.current_block = None
def start_new_block(self, html_block, style):
self.end_current_block()
self.current_block = Block(self.styles_manager, html_block, style)
self.open_html_blocks.add(html_block)
return self.current_block return self.current_block
def start_new_table(self, html_tag, tag_style=None):
self.current_table = Table(html_tag, tag_style)
self.tables.append(self.current_table)
def start_new_row(self, html_tag, tag_style):
if self.current_table is None:
self.start_new_table(html_tag)
self.current_table.start_new_row(html_tag, tag_style)
def start_new_cell(self, html_tag, tag_style):
if self.current_table is None:
self.start_new_table(html_tag)
self.current_table.start_new_cell(html_tag, tag_style)
def finish_tag(self, html_tag):
if self.current_block is not None and html_tag in self.open_html_blocks:
self.end_current_block()
self.open_html_blocks.discard(html_tag)
if self.current_table is not None:
table_finished = self.current_table.finish_tag(html_tag)
if table_finished:
table = self.tables[-1]
del self.tables[-1]
if self.tables:
self.current_table = self.tables[-1]
self.current_table.add_table(table)
else:
self.current_table = None
self.block_map[table] = len(self.items)
self.items.append(table)
def serialize(self, body): def serialize(self, body):
for block in self.all_blocks: for item in self.items:
block.serialize(body) item.serialize(body)
def __enter__(self): def __enter__(self):
self.pos = len(self.all_blocks) self.pos = len(self.all_blocks)
self.block_map = {}
def __exit__(self, *args): def __exit__(self, *args):
if self.current_block is not None: if self.current_block is not None:
@ -182,7 +234,9 @@ class Blocks(object):
if len(self.all_blocks) > self.pos and self.all_blocks[self.pos].is_empty(): if len(self.all_blocks) > self.pos and self.all_blocks[self.pos].is_empty():
# Delete the empty block corresponding to the <body> tag when the # Delete the empty block corresponding to the <body> tag when the
# body tag has no inline content before its first sub-block # body tag has no inline content before its first sub-block
block = self.all_blocks[self.pos]
del self.all_blocks[self.pos] del self.all_blocks[self.pos]
del self.items[self.block_map.pop(block)]
if self.pos > 0 and self.pos < len(self.all_blocks): if self.pos > 0 and self.pos < len(self.all_blocks):
# Insert a page break corresponding to the start of the html file # Insert a page break corresponding to the start of the html file
self.all_blocks[self.pos].page_break_before = True self.all_blocks[self.pos].page_break_before = True
@ -193,8 +247,6 @@ class Convert(object):
self.oeb, self.docx = oeb, docx self.oeb, self.docx = oeb, docx
self.log, self.opts = docx.log, docx.opts self.log, self.opts = docx.log, docx.opts
self.blocks = Blocks()
def __call__(self): def __call__(self):
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
self.svg_rasterizer = SVGRasterizer() self.svg_rasterizer = SVGRasterizer()
@ -203,6 +255,7 @@ class Convert(object):
self.styles_manager = StylesManager() self.styles_manager = StylesManager()
self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships) self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
self.fonts_manager = FontsManager(self.oeb, self.opts) self.fonts_manager = FontsManager(self.oeb, self.opts)
self.blocks = Blocks(self.styles_manager)
for item in self.oeb.spine: for item in self.oeb.spine:
self.process_item(item) self.process_item(item)
@ -228,61 +281,63 @@ class Convert(object):
if tag_style.is_hidden: if tag_style.is_hidden:
return return
display = tag_style._get('display') display = tag_style._get('display')
inlined = True
if display in {'inline', 'inline-block'} or tagname == 'br': # <br> has display:block but we dont want to start a new paragraph if display in {'inline', 'inline-block'} or tagname == 'br': # <br> has display:block but we dont want to start a new paragraph
if self.blocks.current_block is not None:
self.add_inline_tag(tagname, html_tag, tag_style, stylizer) self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
elif display == 'list-item': elif display == 'list-item':
# TODO: Implement this # TODO: Implement this
inlined = False
self.add_block_tag(tagname, html_tag, tag_style, stylizer) self.add_block_tag(tagname, html_tag, tag_style, stylizer)
elif display.startswith('table') or display == 'inline-table': elif display.startswith('table') or display == 'inline-table':
inlined = False
# TODO: implement this
if display == 'table-cell': if display == 'table-cell':
self.blocks.start_new_cell(html_tag, tag_style)
self.add_block_tag(tagname, html_tag, tag_style, stylizer) self.add_block_tag(tagname, html_tag, tag_style, stylizer)
elif display == 'table-row':
self.blocks.start_new_row(html_tag, tag_style)
elif display in {'table', 'inline-table'}:
self.blocks.start_new_table(html_tag, tag_style)
else: else:
if tagname == 'img' and tag_style['float'] in {'left', 'right'}: if tagname == 'img' and tag_style['float'] in {'left', 'right'}:
# Image is floating so dont start a new paragraph for it # Image is floating so dont start a new paragraph for it
self.add_inline_tag(tagname, html_tag, tag_style, stylizer) self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
else: else:
self.add_block_tag(tagname, html_tag, tag_style, stylizer) self.add_block_tag(tagname, html_tag, tag_style, stylizer)
inlined = False
for child in html_tag.iterchildren('*'): for child in html_tag.iterchildren('*'):
self.process_tag(child, stylizer) self.process_tag(child, stylizer)
if not is_first_tag and html_tag.tail: is_block = html_tag in self.blocks.open_html_blocks
if inlined: self.blocks.finish_tag(html_tag)
self.add_text_to_current_block(html_tag.tail, stylizer.style(html_tag.getparent())) if is_block and tag_style['page-break-after'] == 'avoid':
elif html_tag.tail.strip(): self.blocks.all_blocks[-1].keep_next = True
self.blocks.start_new_block(self.styles_manager, html_tag.getparent(), stylizer.style(html_tag.getparent()))
self.add_text_to_current_block(html_tag.tail, stylizer.style(html_tag.getparent()))
def add_text_to_current_block(self, text, tag_style, ignore_leading_whitespace=False, html_parent=None, is_parent_style=False): if display == 'table-row':
block = self.blocks.current_block return # We ignore the tail for these tags
if block is not None:
block.add_text(text, tag_style, ignore_leading_whitespace=ignore_leading_whitespace, html_parent=html_parent, is_parent_style=is_parent_style) if not is_first_tag and html_tag.tail and (not is_block or not html_tag.tail.isspace()):
# Ignore trailing space after a block tag, as otherwise it will
# become a new empty paragraph
block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent()))
block.add_text(html_tag.tail, stylizer.style(html_tag.getparent()), is_parent_style=True)
def add_block_tag(self, tagname, html_tag, tag_style, stylizer): def add_block_tag(self, tagname, html_tag, tag_style, stylizer):
block = self.blocks.start_new_block(self.styles_manager, html_tag, tag_style) block = self.blocks.start_new_block(html_tag, tag_style)
if tagname == 'img': if tagname == 'img':
self.images_manager.add_image(html_tag, block, stylizer) self.images_manager.add_image(html_tag, block, stylizer)
else: else:
if html_tag.text: if html_tag.text:
block.add_text(html_tag.text, tag_style, ignore_leading_whitespace=True, is_parent_style=True) block.add_text(html_tag.text, tag_style, ignore_leading_whitespace=True, is_parent_style=True)
if tag_style['page-break-after'] == 'avoid':
block.keep_next = True
def add_inline_tag(self, tagname, html_tag, tag_style, stylizer): def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
if tagname == 'br': if tagname == 'br':
if html_tag.tail or html_tag is not tuple(html_tag.getparent().iterchildren('*'))[-1]: if html_tag.tail or html_tag is not tuple(html_tag.getparent().iterchildren('*'))[-1]:
self.blocks.current_block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(tag_style['clear'], 'none')) block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent()))
block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(tag_style['clear'], 'none'))
elif tagname == 'img': elif tagname == 'img':
self.images_manager.add_image(html_tag, self.blocks.current_block, stylizer) block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent()))
self.images_manager.add_image(html_tag, block, stylizer)
else: else:
if html_tag.text: if html_tag.text:
self.add_text_to_current_block(html_tag.text, tag_style, html_parent=html_tag) block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent()))
block.add_text(html_tag.text, tag_style, is_parent_style=False)
def write(self): def write(self):
self.docx.document, self.docx.styles, body = create_skeleton(self.opts) self.docx.document, self.docx.styles, body = create_skeleton(self.opts)

View File

@ -0,0 +1,96 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.ebooks.docx.names import makeelement
class Cell(object):
def __init__(self, html_tag, tag_style):
self.html_tag = html_tag
self.items = []
def add_block(self, block):
self.items.append(block)
def add_table(self, table):
self.items.append(table)
return table
def serialize(self, parent):
tc = makeelement(parent, 'w:tc')
for item in self.items:
item.serialize(tc)
class Row(object):
def __init__(self, html_tag, tag_style=None):
self.html_tag = html_tag
self.cells = []
self.current_cell = None
def start_new_cell(self, html_tag, tag_style):
self.current_cell = Cell(html_tag, tag_style)
def finish_tag(self, html_tag):
if self.current_cell is not None:
if html_tag is self.current_cell.html_tag:
self.cells.append(self.current_cell)
self.current_cell = None
def add_block(self, block):
self.current_cell.add_block(block)
def add_table(self, table):
return self.current_cell.add_table(table)
def serialize(self, parent):
tr = makeelement(parent, 'w:tr')
for cell in self.cells:
cell.serialize(tr)
class Table(object):
def __init__(self, html_tag, tag_style=None):
self.html_tag = html_tag
self.rows = []
self.current_row = None
def finish_tag(self, html_tag):
if self.current_row is not None:
self.current_row.finish_tag(html_tag)
if self.current_row.html_tag is html_tag:
self.rows.append(self.current_row)
self.current_row = None
table_ended = self.html_tag is html_tag
return table_ended
def start_new_row(self, html_tag, html_style):
if self.current_row is not None:
self.rows.append(self.current_row)
self.current_row = Row(html_tag, html_style)
def start_new_cell(self, html_tag, html_style):
if self.current_row is None:
self.start_new_row(html_tag, None)
self.current_row.start_new_cell(html_tag, html_style)
def add_block(self, block):
self.current_row.add_block(block)
def add_table(self, table):
return self.current_row.add_table(table)
def serialize(self, parent):
rows = [r for r in self.rows if r.cells]
if not rows:
return
tbl = makeelement(parent, 'w:tbl')
tblPr = makeelement(tbl, 'w:tblPr')
tblPr
for row in rows:
row.serialize(tbl)