From ed422c7b0fb17ee6b4c7b45106d6293538e9a14f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 13 May 2013 18:39:33 +0530 Subject: [PATCH] DOCX Input: Lists work --- src/calibre/ebooks/docx/block_styles.py | 17 ++- src/calibre/ebooks/docx/numbering.py | 144 +++++++++++++++++++++++- src/calibre/ebooks/docx/styles.py | 33 +++++- src/calibre/ebooks/docx/to_html.py | 23 +++- 4 files changed, 200 insertions(+), 17 deletions(-) diff --git a/src/calibre/ebooks/docx/block_styles.py b/src/calibre/ebooks/docx/block_styles.py index 1770569b61..b501580042 100644 --- a/src/calibre/ebooks/docx/block_styles.py +++ b/src/calibre/ebooks/docx/block_styles.py @@ -175,6 +175,20 @@ def read_shd(parent, dest): if val: ans = simple_color(val, auto='transparent') setattr(dest, 'background_color', ans) + +def read_numbering(parent, dest): + lvl = num_id = None + for np in XPath('./w:numPr')(parent): + for ilvl in XPath('./w:ilvl[@w:val]')(np): + try: + lvl = int(get(ilvl, 'w:val')) + except (ValueError, TypeError): + pass + for num in XPath('./w:numId[@w:val]')(np): + num_id = get(num, 'w:val') + val = (num_id, lvl) if num_id is not None or lvl is not None else inherit + setattr(dest, 'numbering', val) + # }}} class ParagraphStyle(object): @@ -194,6 +208,7 @@ class ParagraphStyle(object): # Misc. 
'text_indent', 'text_align', 'line_height', 'direction', 'background_color', + 'numbering', ) def __init__(self, pPr=None): @@ -210,7 +225,7 @@ class ParagraphStyle(object): ): setattr(self, p, binary_property(pPr, p)) - for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd'): + for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd', 'numbering'): f = globals()['read_%s' % x] f(pPr, self) diff --git a/src/calibre/ebooks/docx/numbering.py b/src/calibre/ebooks/docx/numbering.py index fc1e65db6a..8693e2a9a1 100644 --- a/src/calibre/ebooks/docx/numbering.py +++ b/src/calibre/ebooks/docx/numbering.py @@ -6,6 +6,11 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' +import re +from collections import Counter + +from lxml.html.builder import OL, UL, SPAN + from calibre.ebooks.docx.block_styles import ParagraphStyle from calibre.ebooks.docx.char_styles import RunStyle from calibre.ebooks.docx.names import XPath, get @@ -33,10 +38,26 @@ class Level(object): self.fmt = 'decimal' self.para_link = None self.paragraph_style = self.character_style = None + self.is_numbered = False + self.num_template = None if lvl is not None: self.read_from_xml(lvl) + def copy(self): + ans = Level() + for x in ('restart', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template'): + setattr(ans, x, getattr(self, x)) + return ans + + def format_template(self, counter, ilvl): + def sub(m): + x = int(m.group(1)) - 1 + if x > ilvl or x not in counter: + return '' + return '%d' % (counter[x] - (0 if x == ilvl else 1)) + return re.sub(r'%(\d+)', sub, self.num_template).rstrip() + '\xa0' + def read_from_xml(self, lvl, override=False): for lr in XPath('./w:lvlRestart[@w:val]')(lvl): try: @@ -57,9 +78,13 @@ class Level(object): for lr in XPath('./w:numFmt[@w:val]')(lvl): val = get(lr, 'w:val') if val == 'bullet': + self.is_numbered = False 
self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc') else: + self.is_numbered = True self.fmt = STYLE_MAP.get(val, 'decimal') + if lt and re.match(r'%\d+\.$', lt) is None: + self.num_template = lt for lr in XPath('./w:pStyle[@w:val]')(lvl): self.para_link = get(lr, 'w:val') @@ -78,12 +103,6 @@ class Level(object): else: self.character_style.update(ps) - def copy(self): - ans = Level() - for x in ('restart', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style'): - setattr(ans, x, getattr(self, x)) - return ans - class NumberingDefinition(object): def __init__(self, parent=None): @@ -107,6 +126,7 @@ class Numbering(object): def __init__(self): self.definitions = {} self.instances = {} + self.counters = {} def __call__(self, root, styles): ' Read all numbering style definitions ' @@ -131,6 +151,7 @@ class Numbering(object): if alvl is None: alvl = Level() alvl.read_from_xml(lvl, override=True) + return nd next_pass = {} for n in XPath('./w:num[@w:numId]')(root): @@ -154,3 +175,114 @@ class Numbering(object): if d is not None: self.instances[num_id] = create_instance(n, d) + for num_id, d in self.instances.iteritems(): + self.counters[num_id] = Counter({lvl:d.levels[lvl].start for lvl in d.levels}) + + def get_pstyle(self, num_id, style_id): + d = self.instances.get(num_id, None) + if d is not None: + for ilvl, lvl in d.levels.iteritems(): + if lvl.para_link == style_id: + return ilvl + + def get_para_style(self, num_id, lvl): + d = self.instances.get(num_id, None) + if d is not None: + lvl = d.levels.get(lvl, None) + return getattr(lvl, 'paragraph_style', None) + + def update_counter(self, counter, levelnum, levels): + counter[levelnum] += 1 + for ilvl, lvl in levels.iteritems(): + restart = lvl.restart + if (restart is None and ilvl == levelnum + 1) or restart == levelnum + 1: + counter[ilvl] = lvl.start + + def apply_markup(self, items, body, styles, object_map): + for p, num_id, ilvl in items: + d = self.instances.get(num_id, None) + if d is 
not None: + lvl = d.levels.get(ilvl, None) + if lvl is not None: + counter = self.counters[num_id] + p.tag = 'li' + p.set('value', '%s' % counter[ilvl]) + p.set('list-lvl', str(ilvl)) + p.set('list-id', num_id) + if lvl.num_template is not None: + val = lvl.format_template(counter, ilvl) + p.set('list-template', val) + self.update_counter(counter, ilvl, d.levels) + + def commit(current_run): + if not current_run: + return + start = current_run[0] + parent = start.getparent() + idx = parent.index(start) + + d = self.instances[start.get('list-id')] + ilvl = int(start.get('list-lvl')) + lvl = d.levels[ilvl] + lvlid = start.get('list-id') + start.get('list-lvl') + wrap = (OL if lvl.is_numbered else UL)('\n\t') + has_template = 'list-template' in start.attrib + if has_template: + wrap.set('lvlid', lvlid) + else: + wrap.set('class', styles.register({'list-style-type': lvl.fmt}, 'list')) + parent.insert(idx, wrap) + last_val = None + for child in current_run: + wrap.append(child) + child.tail = '\n\t' + if has_template: + span = SPAN() + span.text = child.text + child.text = None + for gc in child: + span.append(gc) + child.append(span) + span = SPAN(child.get('list-template')) + child.insert(0, span) + for attr in ('list-lvl', 'list-id', 'list-template'): + child.attrib.pop(attr, None) + val = int(child.get('value')) + if last_val == val - 1 or wrap.tag == 'ul': + child.attrib.pop('value') + last_val = val + current_run[-1].tail = '\n' + del current_run[:] + + parents = set() + for child in body.iterdescendants('li'): + parents.add(child.getparent()) + + for parent in parents: + current_run = [] + for child in parent: + if child.tag == 'li': + if current_run: + last = current_run[-1] + if (last.get('list-id') , last.get('list-lvl')) != (child.get('list-id'), child.get('list-lvl')): + commit(current_run) + current_run.append(child) + else: + commit(current_run) + commit(current_run) + + for wrap in body.xpath('//ol[@lvlid]'): + wrap.attrib.pop('lvlid') + wrap.tag = 'div' 
+ for i, li in enumerate(wrap.iterchildren('li')): + li.tag = 'div' + li.attrib.pop('value', None) + li.set('style', 'display:table-row') + obj = object_map[li] + bs = styles.para_cache[obj] + if i == 0: + wrap.set('style', 'display:table; margin-left: %s' % (bs.css.get('margin-left', 0))) + bs.css.pop('margin-left', None) + for child in li: + child.set('style', 'display:table-cell') + diff --git a/src/calibre/ebooks/docx/styles.py b/src/calibre/ebooks/docx/styles.py index a17295aa61..44ae2cea89 100644 --- a/src/calibre/ebooks/docx/styles.py +++ b/src/calibre/ebooks/docx/styles.py @@ -198,8 +198,19 @@ class Styles(object): if default_para.character_style is not None: self.para_char_cache[p] = default_para.character_style + is_numbering = direct_formatting.numbering is not inherit + if is_numbering: + num_id, lvl = direct_formatting.numbering + if num_id is not None: + p.set('calibre_num_id', '%s:%s' % (lvl, num_id)) + if num_id is not None and lvl is not None: + ps = self.numbering.get_para_style(num_id, lvl) + if ps is not None: + parent_styles.append(ps) + for attr in ans.all_properties: - setattr(ans, attr, self.para_val(parent_styles, direct_formatting, attr)) + if not (is_numbering and attr == 'text_indent'): # skip text-indent for lists + setattr(ans, attr, self.para_val(parent_styles, direct_formatting, attr)) return ans def resolve_run(self, r): @@ -244,10 +255,20 @@ class Styles(object): return self.resolve_run(obj) def resolve_numbering(self, numbering): - pass # TODO: Implement this + # When a numPr element appears inside a paragraph style, the lvl info + # must be discarded and pStyle used instead. 
+ self.numbering = numbering + for style in self: + ps = style.paragraph_style + if ps is not None and ps.numbering is not inherit: + lvl = numbering.get_pstyle(ps.numbering[0], style.style_id) + if lvl is None: + ps.numbering = inherit + else: + ps.numbering = (ps.numbering[0], lvl) def register(self, css, prefix): - h = hash(tuple(css.iteritems())) + h = hash(frozenset(css.iteritems())) ans, _ = self.classes.get(h, (None, None)) if ans is None: self.counter[prefix] += 1 @@ -266,13 +287,15 @@ class Styles(object): self.register(css, 'text') def class_name(self, css): - h = hash(tuple(css.iteritems())) + h = hash(frozenset(css.iteritems())) return self.classes.get(h, (None, None))[0] def generate_css(self): prefix = textwrap.dedent( '''\ - p { margin: 0; padding: 0; text-indent: 1.5em } + p { text-indent: 1.5em } + + ul, ol, p { margin: 0; padding: 0 } ''') ans = [] diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py index 7aa0383da6..8cd79074e3 100644 --- a/src/calibre/ebooks/docx/to_html.py +++ b/src/calibre/ebooks/docx/to_html.py @@ -7,6 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' import sys, os, re +from collections import OrderedDict from lxml import html from lxml.html.builder import ( @@ -36,7 +37,7 @@ class Convert(object): self.mi = self.docx.metadata self.body = BODY() self.styles = Styles() - self.object_map = {} + self.object_map = OrderedDict() self.html = HTML( HEAD( META(charset='utf-8'), @@ -72,6 +73,19 @@ class Convert(object): pass # TODO: Last section properties else: self.log.debug('Unknown top-level tag: %s, ignoring' % barename(top_level.tag)) + + numbered = [] + for html_obj, obj in self.object_map.iteritems(): + raw = obj.get('calibre_num_id', None) + if raw is not None: + lvl, num_id = raw.partition(':')[0::2] + try: + lvl = int(lvl) + except (TypeError, ValueError): + lvl = 0 + numbered.append((html_obj, num_id, lvl)) + self.numbering.apply_markup(numbered, self.body, self.styles, 
self.object_map) + if len(self.body) > 0: self.body.text = '\n\t' for child in self.body: @@ -102,7 +116,7 @@ class Convert(object): nname = get_name(NUMBERING, 'numbering.xml') sname = get_name(STYLES, 'styles.xml') - numbering = Numbering() + numbering = self.numbering = Numbering() if sname is not None: try: @@ -133,6 +147,7 @@ class Convert(object): def convert_p(self, p): dest = P() + self.object_map[dest] = p style = self.styles.resolve_paragraph(p) for run in XPath('descendant::w:r')(p): span = self.convert_run(run) @@ -173,7 +188,6 @@ class Convert(object): wrapper = self.wrap_elems(spans, SPAN()) wrapper.set('class', cls) - self.object_map[dest] = p return dest def wrap_elems(self, elems, wrapper): @@ -188,7 +202,7 @@ class Convert(object): def convert_run(self, run): ans = SPAN() - ans.run = run + self.object_map[ans] = run text = Text(ans, 'text', []) for child in run: @@ -224,7 +238,6 @@ class Convert(object): ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup' if style.lang is not inherit: ans.lang = style.lang - self.object_map[ans] = run return ans if __name__ == '__main__':