More work on the DOCX input plugin, we can now read block styles

2025-08-11 09:13:57 -04:00 · 2013-05-05 21:28:44 +05:30 · 2013-05-05 21:28:44 +05:30 · 159c08b97e
commit 159c08b97e
parent 8ff4ff2aa0
4 changed files with 417 additions and 8 deletions
--- a/src/calibre/ebooks/docx/container.py
+++ b/src/calibre/ebooks/docx/container.py
@ -105,6 +105,9 @@ class DOCX(object):
            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
            self.names[name] = f

+    def exists(self, name):
+        ''' Return True if name is one of the keys of self.names. '''
+        known = self.names
+        return name in known
+
    def read(self, name):
        if hasattr(self, 'zipf'):
            return self.zipf.open(name).read()
@ -149,14 +152,39 @@ class DOCX(object):
            self.relationships_rmap[target] = typ

    @property
-    def document(self):
+    def document_name(self):
+        ''' Name of the main document part: the target of the DOCUMENT
+        relationship if there is one, otherwise the first name found in
+        self.names that is, or ends with, document.xml. Raises InvalidDOCX if
+        neither exists. '''
        name = self.relationships.get(DOCUMENT, None)
        if name is None:
            names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
            if not names:
                raise InvalidDOCX('The file %s docx file has no main document' % self.name)
            name = names[0]
-        return fromstring(self.read(name))
+        return name
+
+    @property
+    def document(self):
+        ''' The main document part, read and parsed with fromstring(). '''
+        raw = self.read(self.document_name)
+        return fromstring(raw)
+
+    @property
+    def document_relationships(self):
+        '''
+        Read the relationships part that accompanies the main document
+        (``<dir>/_rels/<document name>.rels``).
+
+        :return: A tuple ``(by_id, by_type)`` of dicts that map the Id and the
+            Type of every relationship to its target name, resolved against
+            the directory of the main document. Both dicts are empty when the
+            relationships part cannot be read.
+        '''
+        parts = self.document_name.split('/')
+        base = '/'.join(parts[:-1])
+        rels_name = '/'.join(parts[:-1] + ['_rels', parts[-1] + '.rels'])
+        by_id, by_type = {}, {}
+        try:
+            raw = self.read(rels_name)
+        except KeyError:
+            # No relationships part for the document, nothing to map
+            return by_id, by_type
+
+        root = fromstring(raw)
+        for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
+            target = item.get('Target').lstrip('/')
+            if base:
+                # Only prefix the directory when there is one. The names in
+                # self.names are relative paths, so joining with an empty
+                # base would add a leading slash that never matches them.
+                target = '/'.join((base, target))
+            by_id[item.get('Id')] = by_type[item.get('Type')] = target
+
+        return by_id, by_type

    @property
    def metadata(self):
--- a/src/calibre/ebooks/docx/names.py
+++ b/src/calibre/ebooks/docx/names.py
@ -11,6 +11,7 @@ from lxml.etree import XPath as X
 DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
 DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
 APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
+STYLES   = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles'

 namespaces = {
    'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
@ -20,6 +21,7 @@ namespaces = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'w10': 'urn:schemas-microsoft-com:office:word',
    'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
+    'xml': 'http://www.w3.org/XML/1998/namespace',
    # Drawing
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
@ -45,3 +47,18 @@ namespaces = {
 def XPath(expr):
+    ''' Compile expr with the prefixes from the module level namespaces map. '''
    return X(expr, namespaces=namespaces)

+def is_tag(x, q):
+    ''' Test whether x (an element, or a tag name) has the tag named by the
+    prefixed name q, for example ``w:p``. The prefix is looked up in the
+    namespaces map. '''
+    actual = getattr(x, 'tag', x)
+    prefix, _sep, local = q.partition(':')
+    expected = '{%s}%s' % (namespaces.get(prefix), local)
+    return expected == actual
+
+def barename(x):
+    ''' Return the part of x after the last ``}``, or all of x if it has none. '''
+    _ns, _brace, local = x.rpartition('}')
+    return local
+
+def XML(x):
+    ''' Qualify the name x with the namespace registered under ``xml``. '''
+    xml_ns = namespaces['xml']
+    return '{%s}%s' % (xml_ns, x)
+
+def get(x, attr, default=None):
+    ''' Return the value of the attribute with the prefixed name attr (for
+    example ``w:val``) on the element x, or default if x does not have it.
+    The prefix must be present in the namespaces map. '''
+    prefix, _sep, local = attr.partition(':')
+    qualified = '{%s}%s' % (namespaces[prefix], local)
+    return x.attrib.get(qualified, default)
+
--- a/src/calibre/ebooks/docx/styles.py
+++ b/src/calibre/ebooks/docx/styles.py
@ -0,0 +1,263 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from collections import OrderedDict
+
+from calibre.ebooks.docx.names import XPath, get
+
+class Inherit:
+    ''' Type of the inherit sentinel, used as the value of properties that
+    are not specified. '''
+inherit = Inherit()
+
+def binary_property(parent, name):
+    '''
+    Read the on/off property ``w:<name>`` from parent.
+
+    :return: ``inherit`` if parent has no such child element, otherwise True
+        if the ``w:val`` of the first such child is one of on/1/true (a
+        missing ``w:val`` counts as on) and False for any other value.
+    '''
+    # The expression has to be built for this property and evaluated against
+    # parent: a bare XPath object is always truthy and cannot be indexed.
+    vals = XPath('./w:%s' % name)(parent)
+    if not vals:
+        return inherit
+    val = get(vals[0], 'w:val', 'on')
+    return val in {'on', '1', 'true'}
+
+def simple_color(col):
+    ''' Return ``#`` + col when col is a six character value other than
+    ``auto``, and ``black`` for anything else (including empty values). '''
+    usable = bool(col) and col != 'auto' and len(col) == 6
+    return '#' + col if usable else 'black'
+
+def simple_float(val, mult=1.0):
+    ''' Return val converted to a float and scaled by mult, or None if the
+    conversion fails. '''
+    try:
+        number = float(val)
+        return number * mult
+    except (ValueError, TypeError, AttributeError, KeyError):
+        return None
+
+# Block styles {{{
+
+# Maps the w:val of a border edge element to the border style keyword stored
+# for that edge. read_border() falls back to 'solid' for values not listed.
+LINE_STYLES = {  # {{{
+    'basicBlackDashes': 'dashed',
+    'basicBlackDots': 'dotted',
+    'basicBlackSquares': 'dashed',
+    'basicThinLines': 'solid',
+    'dashDotStroked': 'groove',
+    'dashed': 'dashed',
+    'dashSmallGap': 'dashed',
+    'dotDash': 'dashed',
+    'dotDotDash': 'dashed',
+    'dotted': 'dotted',
+    'double': 'double',
+    'inset': 'inset',
+    'nil': 'none',
+    'none': 'none',
+    'outset': 'outset',
+    'single': 'solid',
+    'thick': 'solid',
+    'thickThinLargeGap': 'double',
+    'thickThinMediumGap': 'double',
+    'thickThinSmallGap' : 'double',
+    'thinThickLargeGap': 'double',
+    'thinThickMediumGap': 'double',
+    'thinThickSmallGap': 'double',
+    'thinThickThinLargeGap': 'double',
+    'thinThickThinMediumGap': 'double',
+    'thinThickThinSmallGap': 'double',
+    'threeDEmboss': 'ridge',
+    'threeDEngrave': 'groove',
+    'triple': 'double',
+}  # }}}
+
+def read_border(border, dest):
+    '''
+    Read the per edge border properties from a border container element
+    (such as ``w:pBdr``) and store them as attributes on dest.
+
+    For every edge (left, top, right, bottom) the attributes
+    ``padding_<edge>``, ``border_<edge>_width``, ``border_<edge>_style`` and
+    ``border_<edge>_color`` are set. A value that is missing or cannot be
+    parsed is stored as ``inherit``.
+
+    :return: The set of attribute names that were set on dest.
+    '''
+    all_attrs = set()
+    for edge in ('left', 'top', 'right', 'bottom'):
+        # The keys are templates, the edge name is filled in when storing
+        vals = {'padding_%s':inherit, 'border_%s_width':inherit,
+                'border_%s_style':inherit, 'border_%s_color':inherit}
+        all_attrs |= {key % edge for key in vals}
+        # The compiled expression must be evaluated against border, an XPath
+        # object is not itself iterable
+        for elem in XPath('./w:%s' % edge)(border):
+            color = get(elem, 'w:color')
+            if color is not None:
+                vals['border_%s_color'] = simple_color(color)
+            style = get(elem, 'w:val')
+            if style is not None:
+                vals['border_%s_style'] = LINE_STYLES.get(style, 'solid')
+            space = get(elem, 'w:space')
+            if space is not None:
+                try:
+                    vals['padding_%s'] = float(space)
+                except (ValueError, TypeError):
+                    pass
+            # The width lives in w:sz (w:space is the padding read above)
+            sz = get(elem, 'w:sz')
+            if sz is not None:
+                # we don't care about art borders (they are only used for page borders)
+                # w:sz is in eighths of a point, clamped to the range 2 to 96
+                try:
+                    vals['border_%s_width'] = min(96, max(2, float(sz))) / 8
+                except (ValueError, TypeError):
+                    pass
+
+        for key, val in vals.items():
+            setattr(dest, key % edge, val)
+
+    return all_attrs
+
+def read_indent(parent, dest):
+    '''
+    Read ``w:ind`` from parent and set ``padding_left``, ``padding_right``
+    and ``text_indent`` on dest. Each is a string with a unit (``em`` when
+    the value comes from one of the ``*Chars`` attributes, ``pt`` otherwise)
+    or ``inherit`` when not specified.
+
+    The ``*Chars`` attributes take precedence over their plain counterparts,
+    and hanging takes precedence over firstLine. A hanging indent is stored
+    as a negative ``text_indent``.
+
+    :return: The set of attribute names that were set on dest.
+    '''
+    padding_left = padding_right = text_indent = inherit
+    for indent in XPath('./w:ind')(parent):
+        l, lc = get(indent, 'w:left'), get(indent, 'w:leftChars')
+        pl = simple_float(lc, 0.01) if lc is not None else simple_float(l, 0.05) if l is not None else None
+        if pl is not None:
+            padding_left = '%.3f%s' % (pl, 'em' if lc is not None else 'pt')
+
+        r, rc = get(indent, 'w:right'), get(indent, 'w:rightChars')
+        pr = simple_float(rc, 0.01) if rc is not None else simple_float(r, 0.05) if r is not None else None
+        if pr is not None:
+            padding_right = '%.3f%s' % (pr, 'em' if rc is not None else 'pt')
+
+        h, hc = get(indent, 'w:hanging'), get(indent, 'w:hangingChars')
+        fl, flc = get(indent, 'w:firstLine'), get(indent, 'w:firstLineChars')
+        ti = (simple_float(hc, 0.01) if hc is not None else simple_float(h, 0.05) if h is not None else
+              simple_float(flc, 0.01) if flc is not None else simple_float(fl, 0.05) if fl is not None else None)
+        if ti is not None:
+            if hc is not None or h is not None:
+                # The value came from a hanging indent, which moves the first
+                # line outwards, i.e. a negative text-indent
+                ti = -ti
+            # The format needs a %s for the unit, without it this raised TypeError
+            text_indent = '%.3f%s' % (ti, 'em' if hc is not None or (h is None and flc is not None) else 'pt')
+
+    setattr(dest, 'padding_left', padding_left)
+    setattr(dest, 'padding_right', padding_right)
+    setattr(dest, 'text_indent', text_indent)
+    return {'padding_left', 'padding_right', 'text_indent'}
+
+def read_justification(parent, dest):
+    ''' Read ``w:jc`` from parent and set ``text_align`` on dest to left,
+    center, right or justify, or to ``inherit`` when no recognized value is
+    present. Returns the set of attribute names that were set. '''
+    align = inherit
+    for jc in XPath('./w:jc[@w:val]')(parent):
+        val = get(jc, 'w:val')
+        if val:
+            if val in {'left', 'center', 'right',}:
+                align = val
+            elif val in {'both', 'distribute'} or 'thai' in val or 'kashida' in val:
+                align = 'justify'
+    dest.text_align = align
+    return {'text_align'}
+
+def read_spacing(parent, dest):
+    '''
+    Read ``w:spacing`` from parent and set ``padding_top``,
+    ``padding_bottom`` and ``line_height`` on dest. Each is a formatted
+    string, or ``inherit`` when not specified or not parseable.
+
+    The ``*Lines`` attributes take precedence over before/after, and nothing
+    is set for a side whose ``*Autospacing`` attribute is on.
+
+    :return: The set of attribute names that were set on dest.
+    '''
+    padding_top = padding_bottom = line_height = inherit
+    for s in XPath('./w:spacing')(parent):
+        a, al, aa = get(s, 'w:after'), get(s, 'w:afterLines'), get(s, 'w:afterAutospacing')
+        pb = None if aa in {'on', '1', 'true'} else simple_float(al, 0.02) if al is not None else simple_float(a, 0.05) if a is not None else None
+        if pb is not None:
+            padding_bottom = '%.3f%s' % (pb, 'ex' if al is not None else 'pt')
+
+        b, bl, bb = get(s, 'w:before'), get(s, 'w:beforeLines'), get(s, 'w:beforeAutospacing')
+        pt = None if bb in {'on', '1', 'true'} else simple_float(bl, 0.02) if bl is not None else simple_float(b, 0.05) if b is not None else None
+        if pt is not None:
+            padding_top = '%.3f%s' % (pt, 'ex' if bl is not None else 'pt')
+
+        l, lr = get(s, 'w:line'), get(s, 'w:lineRule', 'auto')
+        if l is not None:
+            lh = simple_float(l, 0.05) if lr in {'exactly', 'atLeast'} else simple_float(l, 1/240.0)
+            # simple_float() returns None for a malformed w:line, formatting
+            # None with %.3f would raise TypeError
+            if lh is not None:
+                line_height = '%.3f%s' % (lh, 'pt' if lr in {'exactly', 'atLeast'} else '')
+
+    setattr(dest, 'padding_top', padding_top)
+    setattr(dest, 'padding_bottom', padding_bottom)
+    setattr(dest, 'line_height', line_height)
+    return {'padding_top', 'padding_bottom', 'line_height'}
+
+def read_direction(parent, dest):
+    ''' Read ``w:textFlow`` from parent and set ``direction`` on dest to
+    ``rtl`` when a value contains ``rl`` (case insensitive), otherwise to
+    ``inherit``. Returns the set of attribute names that were set. '''
+    direction = inherit
+    for flow in XPath('./w:textFlow[@w:val]')(parent):
+        val = get(flow, 'w:val')
+        if val and 'rl' in val.lower():
+            direction = 'rtl'
+    dest.direction = direction
+    return {'direction'}
+
+
+class ParagraphStyle(object):
+
+    '''
+    The block level formatting read from a paragraph properties element.
+
+    Every property that is read becomes an attribute of the instance, with
+    the value ``inherit`` when it is not specified, and its name is recorded
+    in ``all_properties``. The border attributes are only set when pPr has a
+    ``w:pBdr`` child.
+    '''
+
+    border_path = XPath('./w:pBdr')
+
+    def __init__(self, pPr):
+        self.all_properties = set()
+        for p in (
+            'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN',
+            'bidi', 'contextualSpacing', 'keepLines', 'keepNext',
+            'mirrorIndents', 'pageBreakBefore', 'snapToGrid',
+            'suppressLineNumbers', 'suppressOverlap', 'topLinePunct',
+            'widowControl', 'wordWrap',
+        ):
+            self.all_properties.add(p)
+            # The value is stored on this style object. Calling setattr()
+            # without the target object raised TypeError.
+            setattr(self, p, binary_property(pPr, p))
+
+        for border in self.border_path(pPr):
+            self.all_properties |= read_border(border, self)
+
+        self.all_properties |= read_indent(pPr, self)
+        self.all_properties |= read_justification(pPr, self)
+        self.all_properties |= read_spacing(pPr, self)
+        self.all_properties |= read_direction(pPr, self)
+
+        # TODO: numPr and outlineLvl
+# }}}
+
+class Style(object):
+
+    ''' A single ``w:style`` element: its id, type, name, the id of the style
+    it is based on and the id of the style it is linked to. based_on is
+    dropped for numbering styles, link is kept only for paragraph and
+    character styles. '''
+
+    name_path = XPath('./w:name[@w:val]')
+    based_on_path = XPath('./w:basedOn[@w:val]')
+    link_path = XPath('./w:link[@w:val]')
+
+    def __init__(self, elem):
+        self.style_id, self.style_type = get(elem, 'w:styleId'), get(elem, 'w:type')
+
+        # The last w:name wins, for basedOn and link it is the first
+        names = self.name_path(elem)
+        self.name = None
+        if names:
+            self.name = get(names[-1], 'w:val')
+
+        parents = self.based_on_path(elem)
+        self.based_on = None
+        if parents and self.style_type != 'numbering':
+            self.based_on = get(parents[0], 'w:val')
+
+        links = self.link_path(elem)
+        self.link = None
+        if links and self.style_type in {'paragraph', 'character'}:
+            self.link = get(links[0], 'w:val')
+
+
+class Styles(object):
+
+    ''' The collection of Style objects read from a styles document, keyed by
+    style id and kept in document order. Call the instance with the parsed
+    root element to populate it. '''
+
+    def __init__(self):
+        self.id_map = OrderedDict()
+
+    def __iter__(self):
+        for style in self.id_map.itervalues():
+            yield style
+
+    def __getitem__(self, key):
+        return self.id_map[key]
+
+    def __len__(self):
+        return len(self.id_map)
+
+    def get(self, key, default=None):
+        return self.id_map.get(key, default)
+
+    def __call__(self, root):
+        for elem in XPath('//w:style')(root):
+            style = Style(elem)
+            if style.style_id:
+                self.id_map[style.style_id] = style
+
+        # Nuke based_on, link attributes that refer to non-existing/incompatible
+        # parents
+        linkable = {('paragraph', 'character'), ('character', 'paragraph')}
+        for style in self:
+            if style.based_on is not None:
+                parent = self.get(style.based_on)
+                if parent is None or parent.style_type != style.style_type:
+                    style.based_on = None
+            if style.link is not None:
+                other = self.get(style.link)
+                if other is None or (style.style_type, other.style_type) not in linkable:
+                    style.link = None
+
+        # TODO: Document defaults (docDefaults)
+
+
+
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@ -9,33 +9,134 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 import sys, os

 from lxml import html
-from lxml.html.builder import (HTML, HEAD, TITLE, BODY, LINK, META)
+from lxml.html.builder import (
+    HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR)

-from calibre.ebooks.docx.container import Container
+from calibre.ebooks.docx.container import DOCX, fromstring
+from calibre.ebooks.docx.names import XPath, is_tag, barename, XML, STYLES, get
+from calibre.ebooks.docx.styles import Styles
+from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
+
+class Text:
+
+    ''' Collects text in buf that is to be stored in the attribute attr of
+    elem. add_elem() stores the text collected so far and continues
+    collecting into the tail of the new element. '''
+
+    def __init__(self, elem, attr, buf):
+        self.elem = elem
+        self.attr = attr
+        self.buf = buf
+
+    def add_elem(self, elem):
+        flushed = ''.join(self.buf)
+        setattr(self.elem, self.attr, flushed)
+        self.elem = elem
+        self.attr = 'tail'
+        self.buf = []

 class Convert(object):

    def __init__(self, path_or_stream, dest_dir=None, log=None):
+        # Open the DOCX, read its metadata and build the skeleton of the
+        # output document: a head with charset, title and stylesheet link,
+        # and an empty body. dest_dir defaults to the current directory.
-        self.container = Container(path_or_stream, log=log)
-        self.log = self.container.log
+        self.docx = DOCX(path_or_stream, log=log)
+        self.log = self.docx.log
        self.dest_dir = dest_dir or os.getcwdu()
+        self.mi = self.docx.metadata
        self.body = BODY()
+        self.styles = Styles()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
-                TITLE('TODO: read from metadata'),
+                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ),
            self.body
        )
+        # Whitespace only text and tail values, so that <html>, <head>, its
+        # children and <body> are serialized on indented lines of their own
+        self.html.text='\n\t'
+        self.html[0].text='\n\t\t'
+        self.html[0].tail='\n'
+        for child in self.html[0]:
+            child.tail = '\n\t\t'
+        self.html[0][-1].tail = '\n\t'
+        self.html[1].text = self.html[1].tail = '\n'
+        # Set the lang attribute on <html>, but only when the metadata
+        # language is known and lang_as_iso639_1() returns a code for it
+        lang = canonicalize_lang(self.mi.language)
+        if lang and lang != 'und':
+            lang = lang_as_iso639_1(lang)
+            if lang:
+                self.html.set('lang', lang)

    def __call__(self):
+        # Run the conversion: read the styles, convert every top level
+        # paragraph of the main document into the body, then write the result.
+        doc = self.docx.document
+        relationships_by_id, relationships_by_type = self.docx.document_relationships
+        self.read_styles(relationships_by_type)
+        for top_level in XPath('/w:document/w:body/*')(doc):
+            if is_tag(top_level, 'w:p'):
+                p = self.convert_p(top_level)
+                self.body.append(p)
+            elif is_tag(top_level, 'w:tbl'):
+                pass  # TODO: tables
+            elif is_tag(top_level, 'w:sectPr'):
+                pass  # TODO: Last section properties
+            else:
+                self.log.debug('Unknown top-level tag: %s, ignoring' % barename(top_level.tag))
+        # Put every child of the body on an indented line of its own
+        if len(self.body) > 0:
+            self.body.text = '\n\t'
+            for child in self.body:
+                child.tail = '\n\t'
+            self.body[-1].tail = '\n'
        self.write()

+    def read_styles(self, relationships_by_type):
+        '''
+        Load the styles of the document into self.styles.
+
+        The styles part is the target of the STYLES relationship, or, when
+        there is no such relationship, a styles.xml in the same directory as
+        the main document. Nothing is loaded if neither exists. A warning is
+        logged if the part that was named cannot be read.
+
+        :param relationships_by_type: Map of relationship type to target
+            name, as returned by ``DOCX.document_relationships``.
+        '''
+        sname = relationships_by_type.get(STYLES, None)
+        if sname is None:
+            parts = self.docx.document_name.split('/')
+            parts[-1] = 'styles.xml'
+            # exists() and read() take the name as a string. Passing the list
+            # of path components raised TypeError (unhashable type).
+            candidate = '/'.join(parts)
+            if self.docx.exists(candidate):
+                sname = candidate
+        if sname is not None:
+            try:
+                raw = self.docx.read(sname)
+            except KeyError:
+                self.log.warn('Styles %s do not exist' % sname)
+            else:
+                self.styles(fromstring(raw))
+
    def write(self):
+        # Serialize the HTML tree as UTF-8 with an HTML 5 doctype into
+        # index.html inside dest_dir
        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)

+    def convert_p(self, p):
+        ''' Convert the paragraph p into a <p> element that contains one
+        converted <span> for every ``w:r`` descendant of p. '''
+        para = P()
+        runs = XPath('descendant::w:r')(p)
+        for r in runs:
+            para.append(self.convert_run(r))
+
+        return para
+
+    def convert_run(self, run):
+        '''
+        Convert a ``w:r`` run into a <span>.
+
+        Text from ``w:t`` children is collected into the span. Text marked
+        ``xml:space="preserve"`` gets a nested span with
+        ``white-space:pre-wrap``. ``w:cr`` and ``w:br`` children become <br>
+        elements, with a page-break or clear style when ``w:type`` or
+        ``w:clear`` ask for it. Text that follows a child element is stored
+        in the tail of that element.
+
+        :return: The <span> element.
+        '''
+        ans = SPAN()
+        text = Text(ans, 'text', [])
+
+        for child in run:
+            if is_tag(child, 'w:t'):
+                if not child.text:
+                    continue
+                space = child.get(XML('space'), None)
+                if space == 'preserve':
+                    # The CSS property is white-space, with a hyphen
+                    text.add_elem(SPAN(child.text, style="white-space:pre-wrap"))
+                    ans.append(text.elem)
+                else:
+                    text.buf.append(child.text)
+            elif is_tag(child, 'w:cr'):
+                text.add_elem(BR())
+                # add_elem() only tracks the element, it has to be inserted
+                # into the span here or the break (and its tail text) is lost
+                ans.append(text.elem)
+            elif is_tag(child, 'w:br'):
+                # The attributes of w:br are in the w namespace, an
+                # unqualified lookup never finds them
+                typ = get(child, 'w:type')
+                if typ in {'column', 'page'}:
+                    br = BR(style='page-break-after:always')
+                else:
+                    clear = get(child, 'w:clear')
+                    if clear in {'all', 'left', 'right'}:
+                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
+                    else:
+                        br = BR()
+                text.add_elem(br)
+                ans.append(text.elem)
+        if text.buf:
+            setattr(text.elem, text.attr, ''.join(text.buf))
+        return ans
+
 if __name__ == '__main__':
-    Convert(sys.argv[-1])()
+    # Convert the file named by the last command line argument, with the
+    # filter level of the default log set to DEBUG
+    from calibre.utils.logging import default_log
+    default_log.filter_level = default_log.DEBUG
+    Convert(sys.argv[-1], log=default_log)()