mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Refactor HTML tag processing in preparation for tables and lists
This commit is contained in:
parent
a2319a1338
commit
6d03706aa3
@ -8,8 +8,6 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from lxml import etree
|
|
||||||
|
|
||||||
from calibre.ebooks.docx.writer.container import create_skeleton
|
from calibre.ebooks.docx.writer.container import create_skeleton
|
||||||
from calibre.ebooks.docx.writer.styles import w, StylesManager
|
from calibre.ebooks.docx.writer.styles import w, StylesManager
|
||||||
from calibre.ebooks.docx.writer.images import ImagesManager
|
from calibre.ebooks.docx.writer.images import ImagesManager
|
||||||
@ -154,13 +152,42 @@ class Block(object):
|
|||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
class Blocks(object):
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.all_blocks = []
|
||||||
|
self.pos = 0
|
||||||
|
self.current_block = None
|
||||||
|
|
||||||
|
def start_new_block(self, styles_manager, html_block, style, is_first_tag=False):
|
||||||
|
if self.current_block is not None:
|
||||||
|
self.all_blocks.append(self.current_block)
|
||||||
|
self.current_block = Block(styles_manager, html_block, style, is_first_block=is_first_tag)
|
||||||
|
return self.current_block
|
||||||
|
|
||||||
|
def serialize(self, body):
|
||||||
|
for block in self.all_blocks:
|
||||||
|
block.serialize(body)
|
||||||
|
|
||||||
|
def __enter__(self):
|
||||||
|
self.pos = len(self.all_blocks)
|
||||||
|
|
||||||
|
def __exit__(self, *args):
|
||||||
|
if self.current_block is not None:
|
||||||
|
self.all_blocks.append(self.current_block)
|
||||||
|
self.current_block = None
|
||||||
|
if len(self.all_blocks) > self.pos and self.all_blocks[self.pos].is_empty():
|
||||||
|
# Delete the empty block corresponding to the <body> tag when the
|
||||||
|
# body tag has no text content before its first sub-block
|
||||||
|
del self.all_blocks[self.pos]
|
||||||
|
|
||||||
class Convert(object):
|
class Convert(object):
|
||||||
|
|
||||||
def __init__(self, oeb, docx):
|
def __init__(self, oeb, docx):
|
||||||
self.oeb, self.docx = oeb, docx
|
self.oeb, self.docx = oeb, docx
|
||||||
self.log, self.opts = docx.log, docx.opts
|
self.log, self.opts = docx.log, docx.opts
|
||||||
|
|
||||||
self.blocks = []
|
self.blocks = Blocks()
|
||||||
|
|
||||||
def __call__(self):
|
def __call__(self):
|
||||||
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
|
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
|
||||||
@ -174,7 +201,7 @@ class Convert(object):
|
|||||||
for item in self.oeb.spine:
|
for item in self.oeb.spine:
|
||||||
self.process_item(item)
|
self.process_item(item)
|
||||||
|
|
||||||
self.styles_manager.finalize(self.blocks)
|
self.styles_manager.finalize(self.blocks.all_blocks)
|
||||||
self.write()
|
self.write()
|
||||||
|
|
||||||
def process_item(self, item):
|
def process_item(self, item):
|
||||||
@ -183,80 +210,78 @@ class Convert(object):
|
|||||||
stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
|
stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
|
||||||
self.abshref = self.images_manager.abshref = item.abshref
|
self.abshref = self.images_manager.abshref = item.abshref
|
||||||
|
|
||||||
is_first_block = True
|
for i, body in enumerate(XPath('//h:body')(item.data)):
|
||||||
for body in XPath('//h:body')(item.data):
|
with self.blocks:
|
||||||
b = Block(self.styles_manager, body, stylizer.style(body), is_first_block=is_first_block)
|
self.process_tag(body, stylizer, is_first_tag=i == 0)
|
||||||
self.blocks.append(b)
|
|
||||||
is_first_block = False
|
|
||||||
self.process_block(body, b, stylizer, ignore_tail=True)
|
|
||||||
if self.blocks and self.blocks[0].is_empty():
|
|
||||||
del self.blocks[0]
|
|
||||||
|
|
||||||
def process_block(self, html_block, docx_block, stylizer, ignore_tail=False):
|
def process_tag(self, html_tag, stylizer, is_first_tag=False):
|
||||||
block_style = stylizer.style(html_block)
|
tagname = barename(html_tag.tag)
|
||||||
if block_style.is_hidden:
|
if tagname in {'script', 'style', 'title', 'meta'}:
|
||||||
return
|
return
|
||||||
if html_block.tag.endswith('}img'):
|
tag_style = stylizer.style(html_tag)
|
||||||
self.images_manager.add_image(html_block, docx_block, stylizer)
|
if tag_style.is_hidden:
|
||||||
else:
|
|
||||||
if html_block.text:
|
|
||||||
docx_block.add_text(html_block.text, block_style, ignore_leading_whitespace=True, is_parent_style=True)
|
|
||||||
|
|
||||||
for child in html_block.iterchildren(etree.Element):
|
|
||||||
tag = barename(child.tag)
|
|
||||||
style = stylizer.style(child)
|
|
||||||
display = style._get('display')
|
|
||||||
if display == 'block' and tag != 'br':
|
|
||||||
if tag == 'img' and style['float'] in {'left', 'right'}:
|
|
||||||
# Image is floating so dont start a new paragraph for
|
|
||||||
# it
|
|
||||||
self.process_inline(child, self.blocks[-1], stylizer)
|
|
||||||
else:
|
|
||||||
b = Block(self.styles_manager, child, style)
|
|
||||||
self.blocks.append(b)
|
|
||||||
self.process_block(child, b, stylizer)
|
|
||||||
else:
|
|
||||||
self.process_inline(child, self.blocks[-1], stylizer)
|
|
||||||
|
|
||||||
if block_style['page-break-after'] == 'avoid':
|
|
||||||
self.blocks[-1].keep_next = True
|
|
||||||
|
|
||||||
if ignore_tail is False and html_block.tail and html_block.tail.strip():
|
|
||||||
style = stylizer.style(html_block.getparent())
|
|
||||||
b = Block(self.styles_manager, html_block.getparent(), style)
|
|
||||||
self.blocks.append(b)
|
|
||||||
b.add_text(html_block.tail, style, is_parent_style=True)
|
|
||||||
|
|
||||||
def process_inline(self, html_child, docx_block, stylizer):
|
|
||||||
tag = barename(html_child.tag)
|
|
||||||
style = stylizer.style(html_child)
|
|
||||||
if style.is_hidden:
|
|
||||||
return
|
return
|
||||||
if tag == 'br':
|
display = tag_style._get('display')
|
||||||
if html_child.tail or html_child is not html_child.getparent()[-1]:
|
inlined = True
|
||||||
docx_block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(style['clear'], 'none'))
|
if display in {'inline', 'inline-block'} or tagname == 'br': # <br> has display:block but we dont want to start a new paragraph
|
||||||
elif tag == 'img':
|
if self.blocks.current_block is not None:
|
||||||
self.images_manager.add_image(html_child, docx_block, stylizer)
|
self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
|
||||||
|
elif display == 'list-item':
|
||||||
|
# TODO: Implement this
|
||||||
|
inlined = False
|
||||||
|
self.add_block_tag(tagname, html_tag, tag_style, stylizer)
|
||||||
|
elif display.startswith('table') or display == 'inline-table':
|
||||||
|
inlined = False
|
||||||
|
# TODO: implement this
|
||||||
|
if display == 'table-cell':
|
||||||
|
self.add_block_tag(tagname, html_tag, tag_style, stylizer)
|
||||||
else:
|
else:
|
||||||
if html_child.text:
|
if tagname == 'img' and tag_style['float'] in {'left', 'right'}:
|
||||||
docx_block.add_text(html_child.text, style, html_parent=html_child)
|
# Image is floating so dont start a new paragraph for it
|
||||||
for child in html_child.iterchildren(etree.Element):
|
self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
|
||||||
style = stylizer.style(child)
|
else:
|
||||||
display = style.get('display', 'inline')
|
self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_first_tag=is_first_tag)
|
||||||
if display == 'block':
|
inlined = False
|
||||||
b = Block(self.styles_manager, child, style)
|
|
||||||
self.blocks.append(b)
|
|
||||||
self.process_block(child, b, stylizer)
|
|
||||||
else:
|
|
||||||
self.process_inline(child, self.blocks[-1], stylizer)
|
|
||||||
|
|
||||||
if html_child.tail:
|
for child in html_tag.iterchildren('*'):
|
||||||
self.blocks[-1].add_text(html_child.tail, stylizer.style(html_child.getparent()), html_parent=html_child.getparent(), is_parent_style=True)
|
self.process_tag(child, stylizer)
|
||||||
|
|
||||||
|
if not is_first_tag and html_tag.tail:
|
||||||
|
if inlined:
|
||||||
|
self.add_text_to_current_block(html_tag.tail, stylizer.style(html_tag.getparent()))
|
||||||
|
elif html_tag.tail.strip():
|
||||||
|
self.blocks.start_new_block(self.styles_manager, html_tag.getparent(), stylizer.style(html_tag.getparent()))
|
||||||
|
self.add_text_to_current_block(html_tag.tail, stylizer.style(html_tag.getparent()))
|
||||||
|
|
||||||
|
def add_text_to_current_block(self, text, tag_style, ignore_leading_whitespace=False, html_parent=None, is_parent_style=False):
|
||||||
|
block = self.blocks.current_block
|
||||||
|
if block is not None:
|
||||||
|
block.add_text(text, tag_style, ignore_leading_whitespace=ignore_leading_whitespace, html_parent=html_parent, is_parent_style=is_parent_style)
|
||||||
|
|
||||||
|
def add_block_tag(self, tagname, html_tag, tag_style, stylizer, is_first_tag=False):
|
||||||
|
block = self.blocks.start_new_block(self.styles_manager, html_tag, tag_style, is_first_tag=is_first_tag)
|
||||||
|
if tagname == 'img':
|
||||||
|
self.images_manager.add_image(html_tag, block, stylizer)
|
||||||
|
else:
|
||||||
|
if html_tag.text:
|
||||||
|
block.add_text(html_tag.text, tag_style, ignore_leading_whitespace=True, is_parent_style=True)
|
||||||
|
if tag_style['page-break-after'] == 'avoid':
|
||||||
|
block.keep_next = True
|
||||||
|
|
||||||
|
def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
|
||||||
|
if tagname == 'br':
|
||||||
|
if html_tag.tail or html_tag is not tuple(html_tag.getparent().iterchildren('*'))[-1]:
|
||||||
|
self.blocks.current_block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(tag_style['clear'], 'none'))
|
||||||
|
elif tagname == 'img':
|
||||||
|
self.images_manager.add_image(html_tag, self.blocks.current_block, stylizer)
|
||||||
|
else:
|
||||||
|
if html_tag.text:
|
||||||
|
self.add_text_to_current_block(html_tag.text, tag_style, html_parent=html_tag)
|
||||||
|
|
||||||
def write(self):
|
def write(self):
|
||||||
self.docx.document, self.docx.styles, body = create_skeleton(self.opts)
|
self.docx.document, self.docx.styles, body = create_skeleton(self.opts)
|
||||||
for block in self.blocks:
|
self.blocks.serialize(body)
|
||||||
block.serialize(body)
|
body.append(body[0]) # Move <sectPr> to the end
|
||||||
self.styles_manager.serialize(self.docx.styles)
|
self.styles_manager.serialize(self.docx.styles)
|
||||||
self.images_manager.serialize(self.docx.images)
|
self.images_manager.serialize(self.docx.images)
|
||||||
self.fonts_manager.serialize(self.styles_manager.text_styles, self.docx.font_table, self.docx.embedded_fonts, self.docx.fonts)
|
self.fonts_manager.serialize(self.styles_manager.text_styles, self.docx.font_table, self.docx.embedded_fonts, self.docx.fonts)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user