mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
More work on the DOCX input plugin, we can now read block styles
This commit is contained in:
parent
8ff4ff2aa0
commit
159c08b97e
@ -105,6 +105,9 @@ class DOCX(object):
|
||||
name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
|
||||
self.names[name] = f
|
||||
|
||||
def exists(self, name):
    ''' Return True if `name` is one of the keys of self.names. '''
    known = self.names
    return name in known
|
||||
|
||||
def read(self, name):
|
||||
if hasattr(self, 'zipf'):
|
||||
return self.zipf.open(name).read()
|
||||
@ -149,14 +152,39 @@ class DOCX(object):
|
||||
self.relationships_rmap[target] = typ
|
||||
|
||||
@property
def document_name(self):
    '''
    Name (as used in self.names) of the main document part.

    The name registered for the DOCUMENT relationship type is preferred.
    When there is none, the first known name that is ``document.xml`` or
    ends with ``/document.xml`` is used instead.

    :raises InvalidDOCX: if no candidate can be found.
    '''
    name = self.relationships.get(DOCUMENT, None)
    if name is None:
        names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
        if not names:
            raise InvalidDOCX('The file %s docx file has no main document' % self.name)
        name = names[0]
    return name

@property
def document(self):
    ''' The main document part, read via self.read() and parsed with fromstring(). '''
    return fromstring(self.read(self.document_name))
|
||||
|
||||
@property
def document_relationships(self):
    '''
    Read the relationships part that belongs to the main document.

    For a document named ``a/b.xml`` the part read is ``a/_rels/b.xml.rels``.

    :return: A tuple ``(by_id, by_type)`` of dicts mapping relationship Id
        and relationship Type to the target. Internal targets are
        expressed relative to the directory of the main document; targets
        whose TargetMode is ``External`` are returned unchanged. Both dicts
        are empty when the relationships part does not exist.
    '''
    name = self.document_name
    parts = name.split('/')
    base = '/'.join(parts[:-1])
    by_id, by_type = {}, {}
    name = '/'.join(parts[:-1] + ['_rels', parts[-1] + '.rels'])
    try:
        raw = self.read(name)
    except KeyError:
        # No relationships part for the document: nothing to report
        return by_id, by_type

    root = fromstring(raw)
    for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
        target = item.get('Target')
        if item.get('TargetMode', None) != 'External':
            # Only in-package targets are resolved against the document's
            # directory; external ones (e.g. URLs) must be left alone.
            target = target.lstrip('/')
            if base:
                # Avoid a leading '/' when the document lives at the
                # package root, names in self.names never start with one.
                target = '/'.join((base, target))
        typ = item.get('Type')
        Id = item.get('Id')
        by_id[Id] = by_type[typ] = target

    return by_id, by_type
|
||||
|
||||
@property
|
||||
def metadata(self):
|
||||
|
@ -11,6 +11,7 @@ from lxml.etree import XPath as X
|
||||
DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
|
||||
DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
|
||||
APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
|
||||
STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles'
|
||||
|
||||
namespaces = {
|
||||
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
|
||||
@ -20,6 +21,7 @@ namespaces = {
|
||||
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
|
||||
'w10': 'urn:schemas-microsoft-com:office:word',
|
||||
'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
|
||||
'xml': 'http://www.w3.org/XML/1998/namespace',
|
||||
# Drawing
|
||||
'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
|
||||
'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
|
||||
@ -45,3 +47,18 @@ namespaces = {
|
||||
def XPath(expr):
    ''' Compile `expr` with the module level prefix map `namespaces`, so
    that prefixes such as w: can be used in the expression. '''
    compiled = X(expr, namespaces=namespaces)
    return compiled
|
||||
|
||||
def is_tag(x, q):
    '''
    Test whether `x` carries the tag named by the prefixed name `q`.

    :param x: An object with a ``tag`` attribute, or the tag itself.
    :param q: A name of the form ``prefix:local``; the prefix is looked up
        in `namespaces`.
    '''
    actual = getattr(x, 'tag', x)
    prefix, _sep, local = q.partition(':')
    expected = '{%s}%s' % (namespaces.get(prefix, None), local)
    return expected == actual
|
||||
|
||||
def barename(x):
    ''' Return the part of `x` after the last ``}``, or all of `x` if it
    contains no ``}``. '''
    _head, _brace, local = x.rpartition('}')
    return local
|
||||
|
||||
def XML(x):
    ''' Return `x` qualified with the ``xml`` namespace, in the
    ``{namespace}name`` form. '''
    xml_ns = namespaces['xml']
    return '{%s}%s' % (xml_ns, x)
|
||||
|
||||
def get(x, attr, default=None):
    '''
    Read a namespaced attribute from the element `x`.

    :param attr: Attribute name of the form ``prefix:local``. The prefix
        must be a key of `namespaces`, otherwise KeyError is raised.
    :param default: Returned when the attribute is not present.
    '''
    prefix, _sep, local = attr.partition(':')
    qualified = '{%s}%s' % (namespaces[prefix], local)
    return x.attrib.get(qualified, default)
|
||||
|
||||
|
263
src/calibre/ebooks/docx/styles.py
Normal file
263
src/calibre/ebooks/docx/styles.py
Normal file
@ -0,0 +1,263 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
from calibre.ebooks.docx.names import XPath, get
|
||||
|
||||
class Inherit:
    ''' Type of the `inherit` marker, which the readers in this module
    store for a property that the element being read does not specify. '''

# The single shared marker instance
inherit = Inherit()
|
||||
|
||||
def binary_property(parent, name):
    '''
    Read the on/off property ``w:<name>`` from the children of `parent`.

    :return: `inherit` if `parent` has no such child. Otherwise True when
        the w:val attribute of the first such child is one of ``on``, ``1``
        or ``true`` (a missing w:val counts as ``on``), and False for any
        other value.
    '''
    # The expression has to be built for this property and evaluated
    # against parent; a bare compiled XPath object is always truthy and
    # cannot be indexed.
    vals = XPath('./w:%s' % name)(parent)
    if not vals:
        return inherit
    val = get(vals[0], 'w:val', 'on')
    return val in {'on', '1', 'true'}
|
||||
|
||||
def simple_color(col):
    ''' Convert a six character color value into ``#`` + value. Empty
    values, ``auto`` and values of any other length give ``black``. '''
    usable = bool(col) and col != 'auto' and len(col) == 6
    if usable:
        return '#'+col
    return 'black'
|
||||
|
||||
def simple_float(val, mult=1.0):
    ''' Return ``float(val) * mult``, or None when that computation raises
    ValueError, TypeError, AttributeError or KeyError. '''
    try:
        scaled = float(val) * mult
    except (ValueError, TypeError, AttributeError, KeyError):
        scaled = None
    return scaled
|
||||
|
||||
# Block styles {{{
|
||||
|
||||
# Maps the w:val of a border edge (see read_border) to the CSS border style
# used to render it. The table is written grouped by CSS style.
LINE_STYLES = dict(  # {{{
    (word_name, css_name)
    for css_name, word_names in (
        ('dashed', (
            'basicBlackDashes', 'basicBlackSquares', 'dashed',
            'dashSmallGap', 'dotDash', 'dotDotDash',
        )),
        ('dotted', ('basicBlackDots', 'dotted')),
        ('solid', ('basicThinLines', 'single', 'thick')),
        ('groove', ('dashDotStroked', 'threeDEngrave')),
        ('ridge', ('threeDEmboss',)),
        ('double', (
            'double', 'triple',
            'thickThinLargeGap', 'thickThinMediumGap', 'thickThinSmallGap',
            'thinThickLargeGap', 'thinThickMediumGap', 'thinThickSmallGap',
            'thinThickThinLargeGap', 'thinThickThinMediumGap',
            'thinThickThinSmallGap',
        )),
        ('inset', ('inset',)),
        ('outset', ('outset',)),
        ('none', ('nil', 'none')),
    )
    for word_name in word_names
)  # }}}
|
||||
|
||||
def read_border(border, dest):
    '''
    Read the four edges of the border element `border` and store the
    result as attributes on `dest`.

    For every edge in left, top, right, bottom the attributes
    ``padding_<edge>``, ``border_<edge>_width``, ``border_<edge>_style`` and
    ``border_<edge>_color`` are set. Anything the edge element does not
    specify (or specifies with an unparseable number) is set to `inherit`.

    :return: The set of attribute names that were set on `dest`.
    '''
    all_attrs = set()
    for edge in ('left', 'top', 'right', 'bottom'):
        vals = {'padding_%s':inherit, 'border_%s_width':inherit,
                'border_%s_style':inherit, 'border_%s_color':inherit}
        all_attrs |= {key % edge for key in vals}
        # The compiled expression must be evaluated against the border
        # element, a compiled XPath object is not iterable by itself.
        for elem in XPath('./w:%s' % edge)(border):
            color = get(elem, 'w:color')
            if color is not None:
                vals['border_%s_color'] = simple_color(color)
            style = get(elem, 'w:val')
            if style is not None:
                # Unknown line styles are rendered as a plain solid line
                vals['border_%s_style'] = LINE_STYLES.get(style, 'solid')
            space = get(elem, 'w:space')
            if space is not None:
                try:
                    vals['padding_%s'] = float(space)
                except (ValueError, TypeError):
                    pass
            # The width lives in w:sz (w:space is the padding read above)
            sz = get(elem, 'w:sz')
            if sz is not None:
                # we dont care about art borders (they are only used for page borders)
                try:
                    # w:sz is in eighths of a point, so clamp to 2-96 and
                    # divide by 8 to get points
                    vals['border_%s_width'] = min(96, max(2, float(sz))) / 8
                except (ValueError, TypeError):
                    pass

        for key, val in vals.items():
            setattr(dest, key % edge, val)

    return all_attrs
|
||||
|
||||
def read_indent(parent, dest):
    '''
    Read the w:ind children of `parent` and set ``padding_left``,
    ``padding_right`` and ``text_indent`` on `dest`.

    Each value is a string made of a number with three decimals and a
    unit. The ``*Chars`` attributes take precedence over their plain
    counterparts; they are scaled by 0.01 and emitted in em, while the
    plain attributes are scaled by 0.05 and emitted in pt. A hanging
    indent takes precedence over a first line indent and is emitted as a
    negative text indent. Values that are absent or unparseable are left
    as `inherit`.

    :return: The set of attribute names that were set on `dest`.
    '''
    padding_left = padding_right = text_indent = inherit
    for indent in XPath('./w:ind')(parent):
        l, lc = get(indent, 'w:left'), get(indent, 'w:leftChars')
        pl = simple_float(lc, 0.01) if lc is not None else simple_float(l, 0.05) if l is not None else None
        if pl is not None:
            padding_left = '%.3f%s' % (pl, 'em' if lc is not None else 'pt')

        r, rc = get(indent, 'w:right'), get(indent, 'w:rightChars')
        pr = simple_float(rc, 0.01) if rc is not None else simple_float(r, 0.05) if r is not None else None
        if pr is not None:
            padding_right = '%.3f%s' % (pr, 'em' if rc is not None else 'pt')

        h, hc = get(indent, 'w:hanging'), get(indent, 'w:hangingChars')
        fl, flc = get(indent, 'w:firstLine'), get(indent, 'w:firstLineChars')
        if hc is not None:
            ti, unit, hanging = simple_float(hc, 0.01), 'em', True
        elif h is not None:
            ti, unit, hanging = simple_float(h, 0.05), 'pt', True
        elif flc is not None:
            ti, unit, hanging = simple_float(flc, 0.01), 'em', False
        elif fl is not None:
            ti, unit, hanging = simple_float(fl, 0.05), 'pt', False
        else:
            ti, unit, hanging = None, None, False
        if ti is not None:
            if hanging and ti:
                # A hanging indent moves the first line outwards, which
                # is a negative text-indent (zero is left unsigned)
                ti = -ti
            text_indent = '%.3f%s' % (ti, unit)

    setattr(dest, 'padding_left', padding_left)
    setattr(dest, 'padding_right', padding_right)
    setattr(dest, 'text_indent', text_indent)
    return {'padding_left', 'padding_right', 'text_indent'}
|
||||
|
||||
def read_justification(parent, dest):
    '''
    Read the w:jc children of `parent` that have a w:val and set
    ``text_align`` on `dest``.

    ``both``, ``distribute`` and any value containing ``thai`` or
    ``kashida`` give ``justify``; ``left``, ``center`` and ``right`` are
    used as is. Other values are ignored. When several elements match, the
    last recognized one wins, and when none does ``text_align`` is
    `inherit`.

    :return: The set of attribute names that were set on `dest`.
    '''
    alignment = inherit
    for node in XPath('./w:jc[@w:val]')(parent):
        value = get(node, 'w:val')
        if not value:
            continue
        if value in {'both', 'distribute'} or 'thai' in value or 'kashida' in value:
            alignment = 'justify'
        elif value in {'left', 'center', 'right',}:
            alignment = value
    dest.text_align = alignment
    return {'text_align'}
|
||||
|
||||
def read_spacing(parent, dest):
    '''
    Read the w:spacing children of `parent` and set ``padding_top``,
    ``padding_bottom`` and ``line_height`` on `dest`.

    Spacing before/after is skipped when the matching ``*Autospacing``
    attribute is on. Otherwise the ``*Lines`` attribute is preferred
    (scaled by 0.02, emitted in ex) over the plain one (scaled by 0.05,
    emitted in pt). The line height is emitted in pt (scaled by 0.05) when
    w:lineRule is ``exactly`` or ``atLeast`` and as a unitless multiple
    (value / 240) otherwise. Values that are absent or unparseable are
    left as `inherit`.

    :return: The set of attribute names that were set on `dest`.
    '''
    padding_top = padding_bottom = line_height = inherit
    on = {'on', '1', 'true'}
    for s in XPath('./w:spacing')(parent):
        a, al, aa = get(s, 'w:after'), get(s, 'w:afterLines'), get(s, 'w:afterAutospacing')
        pb = None if aa in on else simple_float(al, 0.02) if al is not None else simple_float(a, 0.05) if a is not None else None
        if pb is not None:
            padding_bottom = '%.3f%s' % (pb, 'ex' if al is not None else 'pt')

        b, bl, bb = get(s, 'w:before'), get(s, 'w:beforeLines'), get(s, 'w:beforeAutospacing')
        pt = None if bb in on else simple_float(bl, 0.02) if bl is not None else simple_float(b, 0.05) if b is not None else None
        if pt is not None:
            padding_top = '%.3f%s' % (pt, 'ex' if bl is not None else 'pt')

        l, lr = get(s, 'w:line'), get(s, 'w:lineRule', 'auto')
        if l is not None:
            fixed = lr in {'exactly', 'atLeast'}
            lh = simple_float(l, 0.05 if fixed else 1/240.0)
            # simple_float() returns None for an unparseable value, which
            # cannot be formatted with %f; keep the previous value then
            if lh is not None:
                line_height = '%.3f%s' % (lh, 'pt' if fixed else '')

    setattr(dest, 'padding_top', padding_top)
    setattr(dest, 'padding_bottom', padding_bottom)
    setattr(dest, 'line_height', line_height)
    return {'padding_top', 'padding_bottom', 'line_height'}
|
||||
|
||||
def read_direction(parent, dest):
    '''
    Read the w:textFlow children of `parent` that have a w:val and set
    ``direction`` on `dest`: ``rtl`` if any non-empty value contains ``rl``
    (compared in lower case), `inherit` otherwise.

    :return: The set of attribute names that were set on `dest`.
    '''
    direction = inherit
    for flow in XPath('./w:textFlow[@w:val]')(parent):
        value = get(flow, 'w:val')
        if value and 'rl' in value.lower():
            direction = 'rtl'
    dest.direction = direction
    return {'direction'}
|
||||
|
||||
|
||||
class ParagraphStyle(object):

    '''
    The block level properties read from a paragraph properties element.

    Every property is stored as an attribute on the instance; a property
    that is not specified has the value `inherit`. The names of all the
    attributes that were set are collected in ``all_properties``.
    '''

    border_path = XPath('./w:pBdr')

    def __init__(self, pPr):
        '''
        :param pPr: The element whose children are read.
        '''
        self.all_properties = set()
        for p in (
            'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN',
            'bidi', 'contextualSpacing', 'keepLines', 'keepNext',
            'mirrorIndents', 'pageBreakBefore', 'snapToGrid',
            'suppressLineNumbers', 'suppressOverlap', 'topLinePunct',
            'widowControl', 'wordWrap',
        ):
            self.all_properties.add(p)
            # The value must be stored on this instance (setattr needs
            # the target object as its first argument)
            setattr(self, p, binary_property(pPr, p))

        for border in self.border_path(pPr):
            self.all_properties |= read_border(border, self)

        self.all_properties |= read_indent(pPr, self)
        self.all_properties |= read_justification(pPr, self)
        self.all_properties |= read_spacing(pPr, self)
        self.all_properties |= read_direction(pPr, self)

        # TODO: numPr and outlineLvl
|
||||
# }}}
|
||||
|
||||
class Style(object):

    '''
    The identifying data of a single style element: its id, type, name and
    the ids of the styles it is based on and linked to.
    '''

    name_path = XPath('./w:name[@w:val]')
    based_on_path = XPath('./w:basedOn[@w:val]')
    link_path = XPath('./w:link[@w:val]')

    def __init__(self, elem):
        self.style_id = get(elem, 'w:styleId')
        self.style_type = get(elem, 'w:type')

        # If there are several names, the last one is used
        name_nodes = self.name_path(elem)
        self.name = None
        if name_nodes:
            self.name = get(name_nodes[-1], 'w:val')

        # based_on is never kept for numbering styles
        parent_nodes = self.based_on_path(elem)
        if parent_nodes and self.style_type != 'numbering':
            self.based_on = get(parent_nodes[0], 'w:val')
        else:
            self.based_on = None

        # link is only kept for paragraph and character styles
        link_nodes = self.link_path(elem)
        if link_nodes and self.style_type in {'paragraph', 'character'}:
            self.link = get(link_nodes[0], 'w:val')
        else:
            self.link = None
|
||||
|
||||
|
||||
class Styles(object):

    '''
    The collection of styles of a document, keyed by style id and kept in
    the order in which they were read. Call the instance with the root of
    the parsed styles part to populate it.
    '''

    def __init__(self):
        self.id_map = OrderedDict()

    def __iter__(self):
        return iter(self.id_map.itervalues())

    def __getitem__(self, key):
        return self.id_map[key]

    def __len__(self):
        return len(self.id_map)

    def get(self, key, default=None):
        return self.id_map.get(key, default)

    def __call__(self, root):
        ''' Read every w:style element under `root`. Styles without a
        style id are ignored. '''
        for style_elem in XPath('//w:style')(root):
            style = Style(style_elem)
            if style.style_id:
                self.id_map[style.style_id] = style

        # Nuke based_on, link attributes that refer to non-existing/incompatible
        # parents
        linkable = {('paragraph', 'character'), ('character', 'paragraph')}
        for style in self:
            if style.based_on is not None:
                parent = self.get(style.based_on)
                if parent is None or parent.style_type != style.style_type:
                    style.based_on = None
            if style.link is not None:
                partner = self.get(style.link)
                if partner is None or (style.style_type, partner.style_type) not in linkable:
                    style.link = None

        # TODO: Document defaults (docDefaults)
|
||||
|
||||
|
||||
|
@ -9,33 +9,134 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
import sys, os
|
||||
|
||||
from lxml import html
|
||||
from lxml.html.builder import (HTML, HEAD, TITLE, BODY, LINK, META)
|
||||
from lxml.html.builder import (
|
||||
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR)
|
||||
|
||||
from calibre.ebooks.docx.container import Container
|
||||
from calibre.ebooks.docx.container import DOCX, fromstring
|
||||
from calibre.ebooks.docx.names import XPath, is_tag, barename, XML, STYLES
|
||||
from calibre.ebooks.docx.styles import Styles
|
||||
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||
|
||||
class Text:

    '''
    Accumulates pieces of text in ``buf`` that are destined for the
    attribute ``attr`` of the element ``elem``.
    '''

    def __init__(self, elem, attr, buf):
        self.elem = elem
        self.attr = attr
        self.buf = buf

    def add_elem(self, elem):
        ''' Write the accumulated text to the current element, then start
        accumulating for the ``tail`` of `elem` with an empty buffer. '''
        joined = ''.join(self.buf)
        setattr(self.elem, self.attr, joined)
        self.elem = elem
        self.attr = 'tail'
        self.buf = []
|
||||
|
||||
class Convert(object):

    '''
    Convert a DOCX file into a single HTML file.

    Create an instance and call it; the result is written to ``index.html``
    inside `dest_dir`.
    '''

    def __init__(self, path_or_stream, dest_dir=None, log=None):
        '''
        :param path_or_stream: Passed on to :class:`DOCX`.
        :param dest_dir: Directory to write to, defaults to the current
            working directory.
        :param log: Passed on to :class:`DOCX`; the log of the DOCX
            object is the one that is used afterwards.
        '''
        self.docx = DOCX(path_or_stream, log=log)
        self.log = self.docx.log
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.styles = Styles()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ),
            self.body
        )
        # Whitespace between the elements, so that the output is readable
        self.html.text='\n\t'
        self.html[0].text='\n\t\t'
        self.html[0].tail='\n'
        for child in self.html[0]:
            child.tail = '\n\t\t'
        self.html[0][-1].tail = '\n\t'
        self.html[1].text = self.html[1].tail = '\n'
        lang = canonicalize_lang(self.mi.language)
        if lang and lang != 'und':
            lang = lang_as_iso639_1(lang)
            if lang:
                self.html.set('lang', lang)

    def __call__(self):
        ''' Run the conversion: read the styles, convert every top level
        paragraph of the document body and write the result. Tables and
        section properties are not handled yet. '''
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.read_styles(relationships_by_type)
        for top_level in XPath('/w:document/w:body/*')(doc):
            if is_tag(top_level, 'w:p'):
                p = self.convert_p(top_level)
                self.body.append(p)
            elif is_tag(top_level, 'w:tbl'):
                pass  # TODO: tables
            elif is_tag(top_level, 'w:sectPr'):
                pass  # TODO: Last section properties
            else:
                self.log.debug('Unknown top-level tag: %s, ignoring' % barename(top_level.tag))
        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'
        self.write()

    def read_styles(self, relationships_by_type):
        '''
        Load the styles of the document into self.styles.

        The styles part is taken from the STYLES relationship. Without
        such a relationship, a ``styles.xml`` next to the main document is
        used if it exists. A part that cannot be read is logged and
        otherwise ignored.
        '''
        sname = relationships_by_type.get(STYLES, None)
        if sname is None:
            name = self.docx.document_name.split('/')
            name[-1] = 'styles.xml'
            # Names are '/' separated strings, a list of path components
            # can neither be looked up nor read
            name = '/'.join(name)
            if self.docx.exists(name):
                sname = name
        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.styles(fromstring(raw))

    def write(self):
        ''' Serialize the HTML tree as UTF-8 to index.html in dest_dir. '''
        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)

    def convert_p(self, p):
        ''' Return a <p> element containing one <span> for every w:r
        descendant of the paragraph `p`. '''
        dest = P()
        for run in XPath('descendant::w:r')(p):
            span = self.convert_run(run)
            dest.append(span)

        return dest

    def convert_run(self, run):
        '''
        Return a <span> for the run `run`.

        Text from w:t children becomes the text of the span, w:t children
        with xml:space="preserve" get a nested span that preserves
        whitespace, and w:cr/w:br children become <br> elements. Other
        children are ignored.
        '''
        # Local import: the attributes of w:br are namespace qualified and
        # the module level import list of this file does not include get()
        from calibre.ebooks.docx.names import get

        ans = SPAN()
        text = Text(ans, 'text', [])

        for child in run:
            if is_tag(child, 'w:t'):
                if not child.text:
                    continue
                space = child.get(XML('space'), None)
                if space == 'preserve':
                    text.add_elem(SPAN(child.text, style="white-space:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(child.text)
            elif is_tag(child, 'w:cr'):
                text.add_elem(BR())
                # add_elem() only retargets the text buffer, the new
                # element still has to be inserted into the span
                ans.append(text.elem)
            elif is_tag(child, 'w:br'):
                typ = get(child, 'w:type')
                if typ in {'column', 'page'}:
                    br = BR(style='page-break-after:always')
                else:
                    clear = get(child, 'w:clear')
                    if clear in {'all', 'left', 'right'}:
                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
        if text.buf:
            # Flush the text that follows the last inserted element
            setattr(text.elem, text.attr, ''.join(text.buf))
        return ans
|
||||
|
||||
if __name__ == '__main__':
    # Convert the file named by the last command line argument, with
    # debug level logging enabled
    from calibre.utils.logging import default_log
    default_log.filter_level = default_log.DEBUG
    Convert(sys.argv[-1], log=default_log)()
|
||||
|
Loading…
x
Reference in New Issue
Block a user