# Members added to the ``DOCX`` container class in
# src/calibre/ebooks/docx/container.py (class header is outside this view).

def exists(self, name):
    """Return True if *name* is a known member of this container."""
    return name in self.names

@property
def document_name(self):
    """Name of the main document part.

    Taken from the package relationships (the DOCUMENT relationship type)
    when present; otherwise the first member that is, or ends with,
    ``document.xml``.

    Raises InvalidDOCX if no such member exists.
    """
    name = self.relationships.get(DOCUMENT, None)
    if name is None:
        names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
        if not names:
            raise InvalidDOCX('The file %s docx file has no main document' % self.name)
        name = names[0]
    return name

@property
def document(self):
    """The main document part, parsed with ``fromstring``."""
    return fromstring(self.read(self.document_name))

@property
def document_relationships(self):
    """Relationships declared for the main document part.

    Reads ``<dir>/_rels/<document>.rels`` next to the main document and
    returns a ``(by_id, by_type)`` pair of dicts mapping relationship Id and
    relationship Type to the target's container name. Both dicts are empty
    when the .rels part cannot be read (``read`` raises KeyError).
    """
    parts = self.document_name.split('/')
    base = '/'.join(parts[:-1])
    rels_name = '/'.join(parts[:-1] + ['_rels', parts[-1] + '.rels'])
    by_id, by_type = {}, {}
    try:
        raw = self.read(rels_name)
    except KeyError:
        return by_id, by_type

    root = fromstring(raw)
    for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
        target = item.get('Target').lstrip('/')
        if base:
            # Only prefix the document's directory when there is one. For a
            # document at the archive root an unconditional join produced
            # '/styles.xml', which never matches a key of self.names.
            target = '/'.join((base, target))
        typ = item.get('Type')
        Id = item.get('Id')
        by_id[Id] = by_type[typ] = target

    return by_id, by_type
# Additions to src/calibre/ebooks/docx/names.py. The helpers below look up
# prefixes in the module-level ``namespaces`` dict defined in that file.

# Relationship type that identifies the styles part of a document.
STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles'


def is_tag(x, q):
    """Return True if *x* has the tag named by the prefixed name *q*.

    *x* may be an element (its ``tag`` attribute is used) or a tag string.
    *q* has the form ``prefix:name``, e.g. ``w:p``.
    """
    actual = getattr(x, 'tag', x)
    prefix, _sep, local = q.partition(':')
    expected = '{%s}%s' % (namespaces.get(prefix, None), local)
    return expected == actual


def barename(x):
    """Return the tag *x* with any leading ``{namespace}`` part removed."""
    _ns, _brace, bare = x.rpartition('}')
    return bare


def XML(x):
    """Return *x* qualified with the ``xml`` namespace, in ``{ns}name`` form."""
    xml_ns = namespaces['xml']
    return '{%s}%s' % (xml_ns, x)


def get(x, attr, default=None):
    """Return the value of the prefixed attribute *attr* on element *x*.

    *attr* has the form ``prefix:name``; *default* is returned when the
    attribute is absent. An unknown prefix raises KeyError.
    """
    prefix, _sep, local = attr.partition(':')
    qualified = '{%s}%s' % (namespaces[prefix], local)
    return x.attrib.get(qualified, default)
# First half of src/calibre/ebooks/docx/styles.py: primitive value readers and
# the per-property readers for block (paragraph) styles.

class Inherit:
    pass
# Sentinel meaning "not specified here, take the value from the parent style".
inherit = Inherit()


def binary_property(parent, name):
    """Read the on/off property ``w:<name>`` from the children of *parent*.

    Returns ``inherit`` when there is no such child, otherwise a bool. A
    child without a ``w:val`` attribute counts as on.
    """
    # The expression has to be filled in with the property name and then
    # evaluated against parent; the bare XPath object is neither.
    vals = XPath('./w:%s' % name)(parent)
    if not vals:
        return inherit
    val = get(vals[0], 'w:val', 'on')
    return val in {'on', '1', 'true'}


def simple_color(col):
    """Convert a six character hex colour to ``#rrggbb``.

    Empty values, ``auto`` and anything not six characters long map to black.
    """
    if not col or col == 'auto' or len(col) != 6:
        return 'black'
    return '#'+col


def simple_float(val, mult=1.0):
    """Return ``float(val) * mult``, or None if *val* cannot be converted."""
    try:
        return float(val) * mult
    except (ValueError, TypeError, AttributeError, KeyError):
        return None

# Block styles {{{

LINE_STYLES = {  # {{{
    'basicBlackDashes': 'dashed',
    'basicBlackDots': 'dotted',
    'basicBlackSquares': 'dashed',
    'basicThinLines': 'solid',
    'dashDotStroked': 'groove',
    'dashed': 'dashed',
    'dashSmallGap': 'dashed',
    'dotDash': 'dashed',
    'dotDotDash': 'dashed',
    'dotted': 'dotted',
    'double': 'double',
    'inset': 'inset',
    'nil': 'none',
    'none': 'none',
    'outset': 'outset',
    'single': 'solid',
    'thick': 'solid',
    'thickThinLargeGap': 'double',
    'thickThinMediumGap': 'double',
    'thickThinSmallGap' : 'double',
    'thinThickLargeGap': 'double',
    'thinThickMediumGap': 'double',
    'thinThickSmallGap': 'double',
    'thinThickThinLargeGap': 'double',
    'thinThickThinMediumGap': 'double',
    'thinThickThinSmallGap': 'double',
    'threeDEmboss': 'ridge',
    'threeDEngrave': 'groove',
    'triple': 'double',
}  # }}}


def read_border(border, dest):
    """Read the four edges of the border element *border* onto *dest*.

    For each of left/top/right/bottom this sets ``padding_<edge>``,
    ``border_<edge>_width``, ``border_<edge>_style`` and
    ``border_<edge>_color`` on *dest*; anything not specified is ``inherit``.
    Padding and width are stored as floats.

    Returns the set of attribute names that were set.
    """
    all_attrs = set()
    for edge in ('left', 'top', 'right', 'bottom'):
        vals = {'padding_%s':inherit, 'border_%s_width':inherit,
                'border_%s_style':inherit, 'border_%s_color':inherit}
        all_attrs |= {key % edge for key in vals}
        # The compiled expression must be applied to the border element.
        for elem in XPath('./w:%s' % edge)(border):
            color = get(elem, 'w:color')
            if color is not None:
                vals['border_%s_color'] = simple_color(color)
            style = get(elem, 'w:val')
            if style is not None:
                vals['border_%s_style'] = LINE_STYLES.get(style, 'solid')
            space = get(elem, 'w:space')
            if space is not None:
                try:
                    vals['padding_%s'] = float(space)
                except (ValueError, TypeError):
                    pass
            # The line width lives in w:sz (w:space is the padding read above).
            sz = get(elem, 'w:sz')
            if sz is not None:
                # we dont care about art borders (they are only used for page borders)
                try:
                    # w:sz is in eighths of a point, clamped to 2..96
                    vals['border_%s_width'] = min(96, max(2, float(sz))) / 8.0
                except (ValueError, TypeError):
                    pass

        # items() rather than iteritems() so this runs on Python 2 and 3
        for key, val in vals.items():
            setattr(dest, key % edge, val)

    return all_attrs


def read_indent(parent, dest):
    """Read ``w:ind`` from *parent* onto *dest*.

    Sets ``padding_left``, ``padding_right`` and ``text_indent`` to a string
    with an ``em`` unit (from the ``*Chars`` attributes, scaled by 0.01) or a
    ``pt`` unit (from the plain attributes, scaled by 0.05), or to
    ``inherit``. The ``*Chars`` form wins when both are present, and a hanging
    indent wins over a first line indent.

    Returns the set of attribute names that were set.
    """
    padding_left = padding_right = text_indent = inherit
    for indent in XPath('./w:ind')(parent):
        l, lc = get(indent, 'w:left'), get(indent, 'w:leftChars')
        pl = simple_float(lc, 0.01) if lc is not None else simple_float(l, 0.05) if l is not None else None
        if pl is not None:
            padding_left = '%.3f%s' % (pl, 'em' if lc is not None else 'pt')

        r, rc = get(indent, 'w:right'), get(indent, 'w:rightChars')
        pr = simple_float(rc, 0.01) if rc is not None else simple_float(r, 0.05) if r is not None else None
        if pr is not None:
            padding_right = '%.3f%s' % (pr, 'em' if rc is not None else 'pt')

        h, hc = get(indent, 'w:hanging'), get(indent, 'w:hangingChars')
        fl, flc = get(indent, 'w:firstLine'), get(indent, 'w:firstLineChars')
        hanging = hc is not None or h is not None
        ti = (simple_float(hc, 0.01) if hc is not None else simple_float(h, 0.05) if h is not None else
              simple_float(flc, 0.01) if flc is not None else simple_float(fl, 0.05) if fl is not None else None)
        if ti is not None:
            if hanging and ti:
                # A hanging indent pulls the first line back towards the
                # margin, which is a negative text-indent in CSS.
                ti = -ti
            # The format needs both the number and the unit placeholder.
            text_indent = '%.3f%s' % (ti, 'em' if hc is not None or (h is None and flc is not None) else 'pt')

    setattr(dest, 'padding_left', padding_left)
    setattr(dest, 'padding_right', padding_right)
    setattr(dest, 'text_indent', text_indent)
    return {'padding_left', 'padding_right', 'text_indent'}


def read_justification(parent, dest):
    """Read ``w:jc`` from *parent* and set ``dest.text_align``.

    ``both``, ``distribute`` and any value containing ``thai`` or ``kashida``
    become ``justify``; ``left``, ``center`` and ``right`` are used as is;
    everything else leaves the value at ``inherit``.

    Returns the set of attribute names that were set.
    """
    ans = inherit
    for jc in XPath('./w:jc[@w:val]')(parent):
        val = get(jc, 'w:val')
        if not val:
            continue
        if val in {'both', 'distribute'} or 'thai' in val or 'kashida' in val:
            ans = 'justify'
        if val in {'left', 'center', 'right',}:
            ans = val
    setattr(dest, 'text_align', ans)
    return {'text_align'}
# Second half of src/calibre/ebooks/docx/styles.py: the remaining block
# property readers and the style container classes.

def read_spacing(parent, dest):
    """Read ``w:spacing`` from *parent* onto *dest*.

    Sets ``padding_top``, ``padding_bottom`` and ``line_height``; each is a
    formatted string or ``inherit``. Space before/after is skipped when the
    matching ``*Autospacing`` attribute is on; the ``*Lines`` form (``ex``
    unit) wins over the plain form (``pt`` unit). ``line_height`` is in
    ``pt`` when ``w:lineRule`` is ``exactly`` or ``atLeast`` and a unitless
    multiple (value / 240) otherwise.

    Returns the set of attribute names that were set.
    """
    padding_top = padding_bottom = line_height = inherit
    for s in XPath('./w:spacing')(parent):
        a, al, aa = get(s, 'w:after'), get(s, 'w:afterLines'), get(s, 'w:afterAutospacing')
        pb = None if aa in {'on', '1', 'true'} else simple_float(al, 0.02) if al is not None else simple_float(a, 0.05) if a is not None else None
        if pb is not None:
            padding_bottom = '%.3f%s' % (pb, 'ex' if al is not None else 'pt')

        b, bl, bb = get(s, 'w:before'), get(s, 'w:beforeLines'), get(s, 'w:beforeAutospacing')
        pt = None if bb in {'on', '1', 'true'} else simple_float(bl, 0.02) if bl is not None else simple_float(b, 0.05) if b is not None else None
        if pt is not None:
            padding_top = '%.3f%s' % (pt, 'ex' if bl is not None else 'pt')

        l, lr = get(s, 'w:line'), get(s, 'w:lineRule', 'auto')
        if l is not None:
            fixed = lr in {'exactly', 'atLeast'}
            lh = simple_float(l, 0.05) if fixed else simple_float(l, 1/240.0)
            # simple_float() returns None for an unparseable value; formatting
            # None with %.3f would raise, so keep inheriting in that case.
            if lh is not None:
                line_height = '%.3f%s' % (lh, 'pt' if fixed else '')

    setattr(dest, 'padding_top', padding_top)
    setattr(dest, 'padding_bottom', padding_bottom)
    setattr(dest, 'line_height', line_height)
    return {'padding_top', 'padding_bottom', 'line_height'}


def read_direction(parent, dest):
    """Read ``w:textFlow`` from *parent* and set ``dest.direction``.

    The value is ``rtl`` when the flow value contains ``rl`` (case
    insensitive) and ``inherit`` otherwise.

    Returns the set of attribute names that were set.
    """
    ans = inherit
    for jc in XPath('./w:textFlow[@w:val]')(parent):
        val = get(jc, 'w:val')
        if not val:
            continue
        if 'rl' in val.lower():
            ans = 'rtl'
    setattr(dest, 'direction', ans)
    return {'direction'}


class ParagraphStyle(object):
    """Block level formatting read from a paragraph properties element.

    Every property that is read becomes an attribute of the instance and its
    name is recorded in ``all_properties``.
    """

    border_path = XPath('./w:pBdr')

    def __init__(self, pPr):
        self.all_properties = set()
        for p in (
            'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN',
            'bidi', 'contextualSpacing', 'keepLines', 'keepNext',
            'mirrorIndents', 'pageBreakBefore', 'snapToGrid',
            'suppressLineNumbers', 'suppressOverlap', 'topLinePunct',
            'widowControl', 'wordWrap',
        ):
            self.all_properties.add(p)
            # The value belongs on this style object; setattr() needs the
            # target object as well as the attribute name and value.
            setattr(self, p, binary_property(pPr, p))

        for border in self.border_path(pPr):
            self.all_properties |= read_border(border, self)

        self.all_properties |= read_indent(pPr, self)
        self.all_properties |= read_justification(pPr, self)
        self.all_properties |= read_spacing(pPr, self)
        self.all_properties |= read_direction(pPr, self)

        # TODO: numPr and outlineLvl
# }}}


class Style(object):
    """Identity and linkage of a single ``w:style`` element.

    Exposes ``style_id``, ``style_type``, ``name``, ``based_on`` and
    ``link``; the last three are None when absent.
    """

    name_path = XPath('./w:name[@w:val]')
    based_on_path = XPath('./w:basedOn[@w:val]')
    link_path = XPath('./w:link[@w:val]')

    def __init__(self, elem):
        self.style_id = get(elem, 'w:styleId')
        self.style_type = get(elem, 'w:type')
        names = self.name_path(elem)
        # With several w:name children the last one is used
        self.name = get(names[-1], 'w:val') if names else None
        based_on = self.based_on_path(elem)
        self.based_on = get(based_on[0], 'w:val') if based_on else None
        if self.style_type == 'numbering':
            self.based_on = None
        link = self.link_path(elem)
        self.link = get(link[0], 'w:val') if link else None
        if self.style_type not in {'paragraph', 'character'}:
            self.link = None


class Styles(object):
    """Ordered collection of Style objects keyed by style id.

    Call the instance with a parsed styles document to populate it.
    """

    def __init__(self):
        self.id_map = OrderedDict()

    def __iter__(self):
        # values() rather than itervalues() so this runs on Python 2 and 3
        for s in self.id_map.values():
            yield s

    def __getitem__(self, key):
        return self.id_map[key]

    def __len__(self):
        return len(self.id_map)

    def get(self, key, default=None):
        return self.id_map.get(key, default)

    def __call__(self, root):
        """Read every ``w:style`` under *root* that has a style id."""
        for s in XPath('//w:style')(root):
            s = Style(s)
            if s.style_id:
                self.id_map[s.style_id] = s

        # Nuke based_on, link attributes that refer to non-existing/incompatible
        # parents
        for s in self:
            bo = s.based_on
            if bo is not None:
                p = self.get(bo)
                if p is None or p.style_type != s.style_type:
                    s.based_on = None
            link = s.link
            if link is not None:
                p = self.get(link)
                if p is None or (s.style_type, p.style_type) not in {('paragraph', 'character'), ('character', 'paragraph')}:
                    s.link = None

        # TODO: Document defaults (docDefaults)
# Definitions from src/calibre/ebooks/docx/to_html.py.

class Text:
    """Accumulates text fragments destined for one slot of an element tree.

    ``buf`` collects strings that will be joined and stored on attribute
    ``attr`` (``text`` or ``tail``) of ``elem``.
    """

    def __init__(self, elem, attr, buf):
        self.elem, self.attr, self.buf = elem, attr, buf

    def add_elem(self, elem):
        """Flush the buffer onto the current slot, then target ``elem.tail``.

        This does not insert *elem* into any tree; the caller must do that.
        """
        setattr(self.elem, self.attr, ''.join(self.buf))
        self.elem, self.attr, self.buf = elem, 'tail', []


class Convert(object):
    """Convert a DOCX file to ``index.html`` in a destination directory."""

    def __init__(self, path_or_stream, dest_dir=None, log=None):
        self.docx = DOCX(path_or_stream, log=log)
        self.log = self.docx.log
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.styles = Styles()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ),
            self.body
        )
        # Whitespace only: indent <head> and its children in the output
        self.html.text='\n\t'
        self.html[0].text='\n\t\t'
        self.html[0].tail='\n'
        for child in self.html[0]:
            child.tail = '\n\t\t'
        self.html[0][-1].tail = '\n\t'
        self.html[1].text = self.html[1].tail = '\n'
        lang = canonicalize_lang(self.mi.language)
        if lang and lang != 'und':
            lang = lang_as_iso639_1(lang)
            if lang:
                self.html.set('lang', lang)

    def __call__(self):
        """Read styles, convert every top level paragraph and write the result."""
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.read_styles(relationships_by_type)
        for top_level in XPath('/w:document/w:body/*')(doc):
            if is_tag(top_level, 'w:p'):
                p = self.convert_p(top_level)
                self.body.append(p)
            elif is_tag(top_level, 'w:tbl'):
                pass  # TODO: tables
            elif is_tag(top_level, 'w:sectPr'):
                pass  # TODO: Last section properties
            else:
                self.log.debug('Unknown top-level tag: %s, ignoring' % barename(top_level.tag))
        if len(self.body) > 0:
            # Whitespace only: put every block on its own line
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'
        self.write()

    def read_styles(self, relationships_by_type):
        """Load the styles part into ``self.styles`` if one can be found.

        The part named by the STYLES relationship is used; failing that, a
        ``styles.xml`` next to the main document. A part that cannot be read
        (``read`` raises KeyError) is logged and skipped.
        """
        sname = relationships_by_type.get(STYLES, None)
        if sname is None:
            parts = self.docx.document_name.split('/')
            parts[-1] = 'styles.xml'
            # exists() looks the name up in a dict, so it has to be the
            # joined string, not the list of path components.
            name = '/'.join(parts)
            if self.docx.exists(name):
                sname = name
        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.styles(fromstring(raw))

    def write(self):
        """Serialize the HTML tree to ``index.html`` in ``self.dest_dir``."""
        # NOTE(review): doctype is an empty string here -- confirm whether an
        # HTML doctype was intended.
        raw = html.tostring(self.html, encoding='utf-8', doctype='')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)

    def convert_p(self, p):
        """Return a <p> holding one <span> per ``w:r`` descendant of *p*."""
        dest = P()
        for run in XPath('descendant::w:r')(p):
            span = self.convert_run(run)
            dest.append(span)

        return dest

    def convert_run(self, run):
        """Return a <span> with the text and line breaks of the run *run*.

        ``w:t`` children contribute text (wrapped in a pre-wrap span when
        ``xml:space`` is ``preserve``), ``w:cr`` and ``w:br`` become <br>.
        """
        # Local import: to_html.py does not import the namespaced attribute
        # reader at module level.
        from calibre.ebooks.docx.names import get

        ans = SPAN()
        text = Text(ans, 'text', [])

        for child in run:
            if is_tag(child, 'w:t'):
                if not child.text:
                    continue
                space = child.get(XML('space'), None)
                if space == 'preserve':
                    # The CSS property is white-space, with a hyphen
                    text.add_elem(SPAN(child.text, style="white-space:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(child.text)
            elif is_tag(child, 'w:cr'):
                text.add_elem(BR())
                # add_elem() only retargets the buffer; without this the <br>
                # and any text after it never reach the output.
                ans.append(text.elem)
            elif is_tag(child, 'w:br'):
                # w:type and w:clear are namespace qualified, so a plain
                # child.get('type') never finds them.
                typ = get(child, 'w:type')
                if typ in {'column', 'page'}:
                    br = BR(style='page-break-after:always')
                else:
                    clear = get(child, 'w:clear')
                    if clear in {'all', 'left', 'right'}:
                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
        if text.buf:
            setattr(text.elem, text.attr, ''.join(text.buf))
        return ans

if __name__ == '__main__':
    from calibre.utils.logging import default_log
    default_log.filter_level = default_log.DEBUG
    Convert(sys.argv[-1], log=default_log)()