More work on the DOCX input plugin: we can now read block styles

This commit is contained in:
Kovid Goyal 2013-05-05 21:28:44 +05:30
parent 8ff4ff2aa0
commit 159c08b97e
4 changed files with 417 additions and 8 deletions

View File

@ -105,6 +105,9 @@ class DOCX(object):
name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
self.names[name] = f
def exists(self, name):
    ''' Return True if ``name`` is one of the keys of ``self.names``. '''
    known = self.names
    return name in known
def read(self, name):
if hasattr(self, 'zipf'):
return self.zipf.open(name).read()
@ -149,14 +152,39 @@ class DOCX(object):
self.relationships_rmap[target] = typ
@property
def document_name(self):
    '''
    Name of the main document part inside the container.

    The name recorded for the DOCUMENT relationship is used when present.
    Otherwise the member names are searched for ``document.xml`` (at the
    top level or in any folder).

    :raises InvalidDOCX: if neither lookup finds a document.
    '''
    name = self.relationships.get(DOCUMENT, None)
    if name is None:
        # Sort the candidates so the choice does not depend on the
        # iteration order of self.names.
        candidates = sorted(
            n for n in self.names
            if n == 'document.xml' or n.endswith('/document.xml'))
        if not candidates:
            raise InvalidDOCX('The file %s docx file has no main document' % self.name)
        name = candidates[0]
    return name
@property
def document(self):
    ''' The main document part, read and parsed with ``fromstring``. '''
    raw = self.read(self.document_name)
    return fromstring(raw)
@property
def document_relationships(self):
    '''
    Relationships declared for the main document part.

    Reads ``<folder>/_rels/<document>.rels`` where ``<folder>`` is the
    folder containing the main document.

    :return: A tuple ``(by_id, by_type)`` of dicts mapping the relationship
        Id and the relationship Type to the target. Both dicts are empty
        if the relationships part does not exist. Internal targets are
        returned as container member names (no leading slash); targets
        marked ``TargetMode="External"`` are returned verbatim.
    '''
    import posixpath

    name = self.document_name
    folder, sep, leaf = name.rpartition('/')
    by_id, by_type = {}, {}
    rels_name = posixpath.join(folder, '_rels', leaf + '.rels')
    try:
        raw = self.read(rels_name)
    except KeyError:
        # No relationships part for the document
        return by_id, by_type

    root = fromstring(raw)
    for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
        target = item.get('Target')
        if item.get('TargetMode', None) != 'External':
            if target.startswith('/'):
                # Absolute targets are relative to the package root, not
                # to the folder of the document
                target = target.lstrip('/')
            else:
                # Resolve against the document folder, collapsing any
                # ../ segments so the result matches a member name
                target = posixpath.normpath(posixpath.join(folder, target))
        by_type[item.get('Type')] = target
        Id = item.get('Id')
        if Id is not None:
            by_id[Id] = target
    return by_id, by_type
@property
def metadata(self):

View File

@ -11,6 +11,7 @@ from lxml.etree import XPath as X
DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles'
namespaces = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
@ -20,6 +21,7 @@ namespaces = {
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
'w10': 'urn:schemas-microsoft-com:office:word',
'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
'xml': 'http://www.w3.org/XML/1998/namespace',
# Drawing
'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
@ -45,3 +47,18 @@ namespaces = {
def XPath(expr):
    ''' Compile ``expr`` with the prefixes from ``namespaces`` bound. '''
    compiled = X(expr, namespaces=namespaces)
    return compiled
def is_tag(x, q):
    '''
    Test whether ``x`` has the tag named by ``q``.

    :param x: An object with a ``tag`` attribute, or the tag itself.
    :param q: A ``prefix:name`` string; the prefix is looked up in
        ``namespaces``.
    '''
    actual = getattr(x, 'tag', x)
    prefix, _sep, local = q.partition(':')
    wanted = '{%s}%s' % (namespaces.get(prefix, None), local)
    return wanted == actual
def barename(x):
    ''' Return the part of ``x`` after the last ``}`` (all of ``x`` if there is none). '''
    _head, _sep, local = x.rpartition('}')
    return local
def XML(x):
    ''' Return ``x`` qualified with the ``xml`` entry of ``namespaces``. '''
    ns = namespaces['xml']
    return '{%s}%s' % (ns, x)
def get(x, attr, default=None):
    '''
    Read a namespaced attribute from ``x.attrib``.

    :param attr: A ``prefix:name`` string; the prefix must be a key of
        ``namespaces`` (a KeyError is raised otherwise).
    :param default: Returned when the attribute is not present.
    '''
    prefix, _sep, local = attr.partition(':')
    qualified = '{%s}%s' % (namespaces[prefix], local)
    return x.attrib.get(qualified, default)

View File

@ -0,0 +1,263 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict
from calibre.ebooks.docx.names import XPath, get
class Inherit:
    ''' Type of the ``inherit`` marker that the readers in this module
    store for a property the element being read does not set. '''


inherit = Inherit()
def binary_property(parent, name):
    '''
    Read the on/off property ``w:<name>`` from the children of ``parent``.

    :return: ``inherit`` if ``parent`` has no such child, otherwise True
        when the child's ``w:val`` is ``on``, ``1`` or ``true`` (a missing
        ``w:val`` counts as ``on``) and False for any other value.
    '''
    # The expression has to be built for this property and evaluated
    # against parent; a bare XPath object is always truthy.
    vals = XPath('./w:%s' % name)(parent)
    if not vals:
        return inherit
    val = get(vals[0], 'w:val', 'on')
    return val in {'on', '1', 'true'}
def simple_color(col):
    '''
    Turn a six character colour value into ``#`` followed by that value.

    Returns ``'black'`` when ``col`` is empty, ``'auto'`` or not exactly
    six characters long.
    '''
    usable = bool(col) and col != 'auto' and len(col) == 6
    return '#' + col if usable else 'black'
def simple_float(val, mult=1.0):
    '''
    Convert ``val`` to a float and scale it by ``mult``.

    :return: The scaled value, or None if the conversion fails.
    '''
    result = None
    try:
        result = float(val) * mult
    except (ValueError, TypeError, AttributeError, KeyError):
        pass
    return result
# Block styles {{{
# Maps the w:val of a border edge element to the CSS border-style keyword
# used for it. read_border() falls back to 'solid' for values that are not
# listed here.
LINE_STYLES = { # {{{
    'basicBlackDashes': 'dashed',
    'basicBlackDots': 'dotted',
    'basicBlackSquares': 'dashed',
    'basicThinLines': 'solid',
    'dashDotStroked': 'groove',
    'dashed': 'dashed',
    'dashSmallGap': 'dashed',
    'dotDash': 'dashed',
    'dotDotDash': 'dashed',
    'dotted': 'dotted',
    'double': 'double',
    'inset': 'inset',
    'nil': 'none',
    'none': 'none',
    'outset': 'outset',
    'single': 'solid',
    'thick': 'solid',
    'thickThinLargeGap': 'double',
    'thickThinMediumGap': 'double',
    'thickThinSmallGap' : 'double',
    'thinThickLargeGap': 'double',
    'thinThickMediumGap': 'double',
    'thinThickSmallGap': 'double',
    'thinThickThinLargeGap': 'double',
    'thinThickThinMediumGap': 'double',
    'thinThickThinSmallGap': 'double',
    'threeDEmboss': 'ridge',
    'threeDEngrave': 'groove',
    'triple': 'double',
} # }}}
def read_border(border, dest):
    '''
    Read the four edges of a border element into attributes of ``dest``.

    For every edge (left, top, right, bottom) the attributes
    ``padding_<edge>``, ``border_<edge>_width``, ``border_<edge>_style``
    and ``border_<edge>_color`` are set on ``dest``; each one is
    ``inherit`` unless the edge element supplies a usable value.

    :param border: The element whose ``w:left``/``w:top``/``w:right``/
        ``w:bottom`` children are read.
    :return: The set of attribute names that were set on ``dest``.
    '''
    all_attrs = set()
    for edge in ('left', 'top', 'right', 'bottom'):
        # Keys are templates, the edge name is filled in when they are used
        vals = {'padding_%s': inherit, 'border_%s_width': inherit,
                'border_%s_style': inherit, 'border_%s_color': inherit}
        all_attrs |= {key % edge for key in vals}
        for elem in XPath('./w:%s' % edge)(border):
            color = get(elem, 'w:color')
            if color is not None:
                vals['border_%s_color'] = simple_color(color)
            style = get(elem, 'w:val')
            if style is not None:
                vals['border_%s_style'] = LINE_STYLES.get(style, 'solid')
            space = get(elem, 'w:space')
            if space is not None:
                try:
                    vals['padding_%s'] = float(space)
                except (ValueError, TypeError):
                    pass
            sz = get(elem, 'w:sz')
            if sz is not None:
                # we dont care about art borders (they are only used for page borders)
                try:
                    # w:sz is in eighths of a point, limited to 2..96
                    vals['border_%s_width'] = min(96, max(2, float(sz))) / 8
                except (ValueError, TypeError):
                    pass
        for key, val in vals.items():
            setattr(dest, key % edge, val)
    return all_attrs
def read_indent(parent, dest):
    '''
    Read the ``w:ind`` children of ``parent`` into ``dest.padding_left``,
    ``dest.padding_right`` and ``dest.text_indent``.

    The ``*Chars`` attributes take precedence over their plain
    counterparts; they are scaled by 0.01 and emitted in ``em``, the plain
    attributes are scaled by 0.05 and emitted in ``pt``. A hanging indent
    takes precedence over a first line indent and is emitted as a negative
    text indent. Attributes that are not specified are set to ``inherit``.

    :return: The set of attribute names that were set on ``dest``.
    '''
    padding_left = padding_right = text_indent = inherit
    for indent in XPath('./w:ind')(parent):
        l, lc = get(indent, 'w:left'), get(indent, 'w:leftChars')
        pl = simple_float(lc, 0.01) if lc is not None else simple_float(l, 0.05) if l is not None else None
        if pl is not None:
            padding_left = '%.3f%s' % (pl, 'em' if lc is not None else 'pt')

        r, rc = get(indent, 'w:right'), get(indent, 'w:rightChars')
        pr = simple_float(rc, 0.01) if rc is not None else simple_float(r, 0.05) if r is not None else None
        if pr is not None:
            padding_right = '%.3f%s' % (pr, 'em' if rc is not None else 'pt')

        h, hc = get(indent, 'w:hanging'), get(indent, 'w:hangingChars')
        fl, flc = get(indent, 'w:firstLine'), get(indent, 'w:firstLineChars')
        # A hanging indent pulls the first line back towards the margin, so
        # it is scaled with a negative factor
        ti = (simple_float(hc, -0.01) if hc is not None else simple_float(h, -0.05) if h is not None else
              simple_float(flc, 0.01) if flc is not None else simple_float(fl, 0.05) if fl is not None else None)
        if ti is not None:
            text_indent = '%.3f%s' % (ti, 'em' if hc is not None or (h is None and flc is not None) else 'pt')

    setattr(dest, 'padding_left', padding_left)
    setattr(dest, 'padding_right', padding_right)
    setattr(dest, 'text_indent', text_indent)
    return {'padding_left', 'padding_right', 'text_indent'}
def read_justification(parent, dest):
    '''
    Read the ``w:jc`` children of ``parent`` into ``dest.text_align``.

    ``both``, ``distribute`` and any value containing ``thai`` or
    ``kashida`` become ``justify``; ``left``, ``center`` and ``right`` are
    used as they are. Anything else leaves the value at ``inherit``.
    Values are compared case-insensitively.

    :return: The set of attribute names that were set on ``dest``.
    '''
    ans = inherit
    for jc in XPath('./w:jc[@w:val]')(parent):
        val = get(jc, 'w:val')
        if not val:
            continue
        # Lower-case first so that camelCase values such as lowKashida and
        # thaiDistribute are matched by the substring tests
        val = val.lower()
        if val in {'both', 'distribute'} or 'thai' in val or 'kashida' in val:
            ans = 'justify'
        elif val in {'left', 'center', 'right'}:
            ans = val
    setattr(dest, 'text_align', ans)
    return {'text_align'}
def read_spacing(parent, dest):
    '''
    Read the ``w:spacing`` children of ``parent`` into ``dest.padding_top``,
    ``dest.padding_bottom`` and ``dest.line_height``.

    Spacing before/after is ignored when the matching ``*Autospacing``
    attribute is on. The ``*Lines`` attributes take precedence, are scaled
    by 0.02 and emitted in ``ex``; the plain attributes are scaled by 0.05
    and emitted in ``pt``. ``w:line`` is emitted in ``pt`` (scaled by 0.05)
    when ``w:lineRule`` is ``exactly`` or ``atLeast`` and as a unitless
    multiple (divided by 240) otherwise. Attributes that are not specified,
    or whose value is not a number, are set to ``inherit``.

    :return: The set of attribute names that were set on ``dest``.
    '''
    padding_top = padding_bottom = line_height = inherit
    for s in XPath('./w:spacing')(parent):
        a, al, aa = get(s, 'w:after'), get(s, 'w:afterLines'), get(s, 'w:afterAutospacing')
        pb = None if aa in {'on', '1', 'true'} else simple_float(al, 0.02) if al is not None else simple_float(a, 0.05) if a is not None else None
        if pb is not None:
            padding_bottom = '%.3f%s' % (pb, 'ex' if al is not None else 'pt')

        b, bl, bb = get(s, 'w:before'), get(s, 'w:beforeLines'), get(s, 'w:beforeAutospacing')
        pt = None if bb in {'on', '1', 'true'} else simple_float(bl, 0.02) if bl is not None else simple_float(b, 0.05) if b is not None else None
        if pt is not None:
            padding_top = '%.3f%s' % (pt, 'ex' if bl is not None else 'pt')

        l, lr = get(s, 'w:line'), get(s, 'w:lineRule', 'auto')
        if l is not None:
            fixed = lr in {'exactly', 'atLeast'}
            lh = simple_float(l, 0.05) if fixed else simple_float(l, 1/240.0)
            # simple_float() returns None for a non-numeric value, which
            # cannot be formatted
            if lh is not None:
                line_height = '%.3f%s' % (lh, 'pt' if fixed else '')

    setattr(dest, 'padding_top', padding_top)
    setattr(dest, 'padding_bottom', padding_bottom)
    setattr(dest, 'line_height', line_height)
    return {'padding_top', 'padding_bottom', 'line_height'}
def read_direction(parent, dest):
    '''
    Read the ``w:textFlow`` children of ``parent`` into ``dest.direction``.

    The direction is ``rtl`` when a ``w:val`` contains ``rl`` (ignoring
    case) and ``inherit`` otherwise.

    :return: The set of attribute names that were set on ``dest``.
    '''
    direction = inherit
    for flow in XPath('./w:textFlow[@w:val]')(parent):
        val = get(flow, 'w:val')
        if val and 'rl' in val.lower():
            direction = 'rtl'
    dest.direction = direction
    return {'direction'}
class ParagraphStyle(object):

    '''
    The block level properties read from a paragraph properties element.

    Every property is stored as an attribute of the instance and its name
    is recorded in ``all_properties``. Properties that the element does not
    specify have the value ``inherit``.
    '''

    border_path = XPath('./w:pBdr')

    def __init__(self, pPr):
        '''
        :param pPr: The element whose children describe the properties.
        '''
        self.all_properties = set()
        for p in (
            'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN',
            'bidi', 'contextualSpacing', 'keepLines', 'keepNext',
            'mirrorIndents', 'pageBreakBefore', 'snapToGrid',
            'suppressLineNumbers', 'suppressOverlap', 'topLinePunct',
            'widowControl', 'wordWrap',
        ):
            self.all_properties.add(p)
            setattr(self, p, binary_property(pPr, p))

        for border in self.border_path(pPr):
            self.all_properties |= read_border(border, self)

        self.all_properties |= read_indent(pPr, self)
        self.all_properties |= read_justification(pPr, self)
        self.all_properties |= read_spacing(pPr, self)
        self.all_properties |= read_direction(pPr, self)

        # TODO: numPr and outlineLvl
# }}}
class Style(object):

    '''
    The identifying data of a single style element: its id, type, name,
    the id of the style it is based on and the id of the style it is
    linked to.
    '''

    name_path = XPath('./w:name[@w:val]')
    based_on_path = XPath('./w:basedOn[@w:val]')
    link_path = XPath('./w:link[@w:val]')

    def __init__(self, elem):
        self.style_id = get(elem, 'w:styleId')
        self.style_type = get(elem, 'w:type')

        # If there are several w:name children, the last one is used
        name_elems = self.name_path(elem)
        self.name = get(name_elems[-1], 'w:val') if name_elems else None

        # basedOn is ignored for numbering styles
        parents = self.based_on_path(elem)
        if parents and self.style_type != 'numbering':
            self.based_on = get(parents[0], 'w:val')
        else:
            self.based_on = None

        # link is only kept for paragraph and character styles
        links = self.link_path(elem)
        if links and self.style_type in {'paragraph', 'character'}:
            self.link = get(links[0], 'w:val')
        else:
            self.link = None
class Styles(object):

    '''
    Collection of Style objects keyed by style id, in the order in which
    they were read. Call the instance with a parsed tree to populate it.
    '''

    def __init__(self):
        self.id_map = OrderedDict()

    def __iter__(self):
        return self.id_map.itervalues()

    def __getitem__(self, key):
        return self.id_map[key]

    def __len__(self):
        return len(self.id_map)

    def get(self, key, default=None):
        return self.id_map.get(key, default)

    def __call__(self, root):
        ''' Read every ``w:style`` element under ``root``. Styles without
        an id are dropped. '''
        for node in XPath('//w:style')(root):
            style = Style(node)
            if style.style_id:
                self.id_map[style.style_id] = style

        # Nuke based_on, link attributes that refer to non-existing/incompatible
        # parents
        linkable = {('paragraph', 'character'), ('character', 'paragraph')}
        for style in self:
            if style.based_on is not None:
                parent = self.get(style.based_on)
                if parent is None or parent.style_type != style.style_type:
                    style.based_on = None

            if style.link is not None:
                partner = self.get(style.link)
                if partner is None or (style.style_type, partner.style_type) not in linkable:
                    style.link = None

        # TODO: Document defaults (docDefaults)

View File

@ -9,33 +9,134 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import sys, os
from lxml import html
from lxml.html.builder import (HTML, HEAD, TITLE, BODY, LINK, META)
from lxml.html.builder import (
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR)
from calibre.ebooks.docx.container import Container
from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import XPath, is_tag, barename, XML, STYLES
from calibre.ebooks.docx.styles import Styles
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
class Text:

    ''' Buffers pieces of text and writes them, joined, to an attribute of
    the current element when the next element is added. '''

    def __init__(self, elem, attr, buf):
        self.elem = elem
        self.attr = attr
        self.buf = buf

    def add_elem(self, elem):
        ''' Flush the buffer to the current element, then make ``elem``
        current; subsequent text goes to its ``tail``. '''
        joined = ''.join(self.buf)
        setattr(self.elem, self.attr, joined)
        self.elem = elem
        self.attr = 'tail'
        self.buf = []
class Convert(object):

    '''
    Convert a DOCX file into ``index.html`` in the destination directory.

    Create an instance and call it to run the conversion.
    '''

    def __init__(self, path_or_stream, dest_dir=None, log=None):
        '''
        :param path_or_stream: Passed to DOCX() to open the input.
        :param dest_dir: Directory for the output, defaults to the current
            working directory.
        :param log: Passed to DOCX(); the log of the DOCX object is used
            for all messages.
        '''
        self.docx = DOCX(path_or_stream, log=log)
        self.log = self.docx.log
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.styles = Styles()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ),
            self.body
        )
        # Whitespace so that the serialized <head> is indented
        self.html.text='\n\t'
        self.html[0].text='\n\t\t'
        self.html[0].tail='\n'
        for child in self.html[0]:
            child.tail = '\n\t\t'
        self.html[0][-1].tail = '\n\t'
        self.html[1].text = self.html[1].tail = '\n'
        lang = canonicalize_lang(self.mi.language)
        if lang and lang != 'und':
            lang = lang_as_iso639_1(lang)
            if lang:
                self.html.set('lang', lang)

    def __call__(self):
        ''' Read the styles, convert every top-level paragraph of the
        document body and write the result. '''
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.read_styles(relationships_by_type)
        for top_level in XPath('/w:document/w:body/*')(doc):
            if is_tag(top_level, 'w:p'):
                p = self.convert_p(top_level)
                self.body.append(p)
            elif is_tag(top_level, 'w:tbl'):
                pass  # TODO: tables
            elif is_tag(top_level, 'w:sectPr'):
                pass  # TODO: Last section properties
            else:
                self.log.debug('Unknown top-level tag: %s, ignoring' % barename(top_level.tag))
        if len(self.body) > 0:
            # Put every child of <body> on its own line
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'
        self.write()

    def read_styles(self, relationships_by_type):
        '''
        Load the styles part into ``self.styles``.

        The part named by the STYLES relationship is used; without one, a
        ``styles.xml`` next to the main document is used if it exists.
        '''
        sname = relationships_by_type.get(STYLES, None)
        if sname is None:
            parts = self.docx.document_name.split('/')
            parts[-1] = 'styles.xml'
            # exists() and read() take a member name, not a list of parts
            candidate = '/'.join(parts)
            if self.docx.exists(candidate):
                sname = candidate

        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.styles(fromstring(raw))

    def write(self):
        ''' Serialize the HTML tree as UTF-8 to ``index.html`` in the
        destination directory. '''
        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)

    def convert_p(self, p):
        ''' Return a <p> containing one <span> for every run that is a
        descendant of ``p``. '''
        dest = P()
        for run in XPath('descendant::w:r')(p):
            span = self.convert_run(run)
            dest.append(span)
        return dest

    def convert_run(self, run):
        '''
        Return a <span> for ``run``.

        Text from ``w:t`` children is collected into the span; text marked
        ``xml:space="preserve"`` gets its own nested <span>. ``w:cr`` and
        ``w:br`` children become <br> elements, with page/column breaks and
        the clear attribute expressed as inline styles.
        '''
        # Imported here as this module does not import get at the top level
        from calibre.ebooks.docx.names import get

        ans = SPAN()
        text = Text(ans, 'text', [])

        for child in run:
            if is_tag(child, 'w:t'):
                if not child.text:
                    continue
                space = child.get(XML('space'), None)
                if space == 'preserve':
                    text.add_elem(SPAN(child.text, style="white-space:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(child.text)
            elif is_tag(child, 'w:cr'):
                text.add_elem(BR())
                # The new element must be attached or it, and the text that
                # follows it, is lost
                ans.append(text.elem)
            elif is_tag(child, 'w:br'):
                # The attributes of w:br are in the w namespace
                typ = get(child, 'w:type')
                if typ in {'column', 'page'}:
                    br = BR(style='page-break-after:always')
                else:
                    clear = get(child, 'w:clear')
                    if clear in {'all', 'left', 'right'}:
                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
        if text.buf:
            # Flush whatever text is left after the last child
            setattr(text.elem, text.attr, ''.join(text.buf))
        return ans
if __name__ == '__main__':
    # Convert the file named by the last command line argument, once, with
    # debug messages enabled
    from calibre.utils.logging import default_log
    default_log.filter_level = default_log.DEBUG
    Convert(sys.argv[-1], log=default_log)()