From 3a021b5873a156996deb8aa43bdf3e4a404b3b45 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 10 May 2013 18:02:29 +0530
Subject: [PATCH] DOCX Input: Work on lists

---
 src/calibre/ebooks/docx/names.py     |   9 +-
 src/calibre/ebooks/docx/numbering.py | 180 +++++++++++++++++++++++++++
 src/calibre/ebooks/docx/styles.py    |  11 ++
 src/calibre/ebooks/docx/to_html.py   |  36 ++++--
 4 files changed, 225 insertions(+), 11 deletions(-)
 create mode 100644 src/calibre/ebooks/docx/numbering.py

diff --git a/src/calibre/ebooks/docx/names.py b/src/calibre/ebooks/docx/names.py
index 2b5dcca653..29a7f0eb81 100644
--- a/src/calibre/ebooks/docx/names.py
+++ b/src/calibre/ebooks/docx/names.py
@@ -8,10 +8,11 @@ __copyright__ = '2013, Kovid Goyal '
 
 from lxml.etree import XPath as X
 
-DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
-DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
-APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
-STYLES   = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles'
+DOCUMENT  = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
+DOCPROPS  = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
+APPPROPS  = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
+STYLES    = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles'
+NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering'
 
 namespaces = {
     'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
diff --git a/src/calibre/ebooks/docx/numbering.py b/src/calibre/ebooks/docx/numbering.py
new file mode 100644
index 0000000000..fc1e65db6a
--- /dev/null
+++ b/src/calibre/ebooks/docx/numbering.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal '
+
+from calibre.ebooks.docx.block_styles import ParagraphStyle
+from calibre.ebooks.docx.char_styles import RunStyle
+from calibre.ebooks.docx.names import XPath, get
+
+# Maps w:numFmt values to list style names (presumably CSS list-style-type
+# values). Formats not listed here fall back to 'decimal' in Level.
+STYLE_MAP = {
+    'aiueo': 'hiragana',
+    'aiueoFullWidth': 'hiragana',
+    'hebrew1': 'hebrew',
+    'iroha': 'katakana-iroha',
+    'irohaFullWidth': 'katakana-iroha',
+    'lowerLetter': 'lower-alpha',
+    'lowerRoman': 'lower-roman',
+    'none': 'none',
+    'upperLetter': 'upper-alpha',
+    'upperRoman': 'upper-roman',
+    'chineseCounting': 'cjk-ideographic',
+    'decimalZero': 'decimal-leading-zero',
+}
+
+class Level(object):
+    ' The numbering properties of a single list level (a w:lvl element) '
+
+    def __init__(self, lvl=None):
+        self.restart = None
+        self.start = 0
+        self.fmt = 'decimal'
+        self.para_link = None
+        self.paragraph_style = self.character_style = None
+
+        if lvl is not None:
+            self.read_from_xml(lvl)
+
+    def read_from_xml(self, lvl, override=False):
+        ''' Update this level from the w:lvl element lvl. Only the properties
+        present in lvl are changed. The override flag is currently unused. '''
+        for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
+            try:
+                self.restart = int(get(lr, 'w:val'))
+            except (TypeError, ValueError):
+                pass
+
+        for lr in XPath('./w:start[@w:val]')(lvl):
+            try:
+                self.start = int(get(lr, 'w:val'))
+            except (TypeError, ValueError):
+                pass
+
+        lt = None
+        for lr in XPath('./w:lvlText[@w:val]')(lvl):
+            lt = get(lr, 'w:val')
+
+        for lr in XPath('./w:numFmt[@w:val]')(lvl):
+            val = get(lr, 'w:val')
+            if val == 'bullet':
+                # The bullet shape is chosen from the level text
+                self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
+            else:
+                self.fmt = STYLE_MAP.get(val, 'decimal')
+
+        for lr in XPath('./w:pStyle[@w:val]')(lvl):
+            self.para_link = get(lr, 'w:val')
+
+        for pPr in XPath('./w:pPr')(lvl):
+            ps = ParagraphStyle(pPr)
+            if self.paragraph_style is None:
+                self.paragraph_style = ps
+            else:
+                self.paragraph_style.update(ps)
+
+        for rPr in XPath('./w:rPr')(lvl):
+            ps = RunStyle(rPr)
+            if self.character_style is None:
+                self.character_style = ps
+            else:
+                self.character_style.update(ps)
+
+    def copy(self):
+        ' Return a new Level with the same attribute values as this one '
+        # NOTE(review): the style objects are shared with the copy, not
+        # duplicated, so a later in-place update() is seen by both levels.
+        ans = Level()
+        for x in ('restart', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style'):
+            setattr(ans, x, getattr(self, x))
+        return ans
+
+class NumberingDefinition(object):
+    ' The levels of a numbering definition, keyed by integer level index '
+
+    def __init__(self, parent=None):
+        self.levels = {}
+        if parent is not None:
+            for lvl in XPath('./w:lvl')(parent):
+                try:
+                    ilvl = int(get(lvl, 'w:ilvl', 0))
+                except (TypeError, ValueError):
+                    ilvl = 0
+                self.levels[ilvl] = Level(lvl)
+
+    def copy(self):
+        ' Return a new definition containing a copy of every level '
+        ans = NumberingDefinition()
+        for l, lvl in self.levels.iteritems():
+            ans.levels[l] = lvl.copy()
+        return ans
+
+class Numbering(object):
+    ' Reads the abstract definitions (w:abstractNum) and instances (w:num) '
+
+    def __init__(self):
+        self.definitions = {}
+        self.instances = {}
+
+    def __call__(self, root, styles):
+        ' Read all numbering style definitions '
+        lazy_load = {}
+        for an in XPath('./w:abstractNum[@w:abstractNumId]')(root):
+            an_id = get(an, 'w:abstractNumId')
+            nsl = XPath('./w:numStyleLink[@w:val]')(an)
+            if nsl:
+                lazy_load[an_id] = get(nsl[0], 'w:val')
+            else:
+                nd = NumberingDefinition(an)
+                self.definitions[an_id] = nd
+
+        def create_instance(n, definition):
+            ' Copy definition and apply the level overrides found in n '
+            nd = definition.copy()
+            for lo in XPath('./w:lvlOverride')(n):
+                ilvl = get(lo, 'w:ilvl')
+                for lvl in XPath('./w:lvl')(lo)[:1]:
+                    nilvl = get(lvl, 'w:ilvl')
+                    ilvl = nilvl if ilvl is None else ilvl
+                    # Levels are keyed by int, see NumberingDefinition
+                    try:
+                        ilvl = int(ilvl)
+                    except (TypeError, ValueError):
+                        ilvl = 0
+                    alvl = nd.levels.get(ilvl, None)
+                    if alvl is None:
+                        alvl = nd.levels[ilvl] = Level()
+                    alvl.read_from_xml(lvl, override=True)
+            return nd
+
+        # Instances whose definition is not yet known are retried once the
+        # style linked definitions have been loaded
+        next_pass = {}
+        for n in XPath('./w:num[@w:numId]')(root):
+            an_id = None
+            num_id = get(n, 'w:numId')
+            for an in XPath('./w:abstractNumId[@w:val]')(n):
+                an_id = get(an, 'w:val')
+            d = self.definitions.get(an_id, None)
+            if d is None:
+                next_pass[num_id] = (an_id, n)
+                continue
+            self.instances[num_id] = create_instance(n, d)
+
+        numbering_links = styles.numbering_style_links
+        for an_id, style_link in lazy_load.iteritems():
+            # Ignore links to styles or instances that could not be read
+            num_id = numbering_links.get(style_link, None)
+            inst = self.instances.get(num_id, None)
+            if inst is not None:
+                self.definitions[an_id] = inst.copy()
+
+        for num_id, (an_id, n) in next_pass.iteritems():
+            d = self.definitions.get(an_id, None)
+            if d is not None:
+                self.instances[num_id] = create_instance(n, d)
+
diff --git a/src/calibre/ebooks/docx/styles.py b/src/calibre/ebooks/docx/styles.py
index 0b77ff5353..a17295aa61 100644
--- a/src/calibre/ebooks/docx/styles.py
+++ b/src/calibre/ebooks/docx/styles.py
@@ -52,6 +52,11 @@ class Style(object):
                 else:
                     self.character_style.update(rs)
 
+        if self.style_type == 'numbering':
+            self.numbering_style_link = None
+            for x in XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
+                self.numbering_style_link = get(x, 'w:val')
+
     def resolve_based_on(self, parent):
         if parent.paragraph_style is not None:
             if self.paragraph_style is None:
@@ -77,6 +82,7 @@ class Styles(object):
         self.classes = {}
         self.counter = Counter()
         self.default_styles = {}
+        self.numbering_style_links = {}
 
     def __iter__(self):
         for s in self.id_map.itervalues():
@@ -98,6 +104,8 @@ class Styles(object):
                 self.id_map[s.style_id] = s
             if s.is_default:
                 self.default_styles[s.style_type] = s
+            if s.style_type == 'numbering' and s.numbering_style_link:
+                self.numbering_style_links[s.style_id] = s.numbering_style_link
 
         self.default_paragraph_style = self.default_character_style = None
 
@@ -235,6 +243,9 @@ class Styles(object):
         if obj.tag.endswith('}r'):
             return self.resolve_run(obj)
 
+    def resolve_numbering(self, numbering):
+        pass  # TODO: Implement this
+
     def register(self, css, prefix):
         h = hash(tuple(css.iteritems()))
         ans, _ = self.classes.get(h, (None, None))
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
index 04bb0b5061..7aa0383da6 100644
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -13,8 +13,9 @@ from lxml.html.builder import (
     HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR)
 
 from calibre.ebooks.docx.container import DOCX, fromstring
-from calibre.ebooks.docx.names import XPath, is_tag, barename, XML, STYLES
+from calibre.ebooks.docx.names import XPath, is_tag, barename, XML, STYLES, NUMBERING
 from calibre.ebooks.docx.styles import Styles, inherit
+from calibre.ebooks.docx.numbering import Numbering
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
 
 class Text:
@@ -89,12 +90,23 @@ class Convert(object):
         self.write()
 
     def read_styles(self, relationships_by_type):
-        sname = relationships_by_type.get(STYLES, None)
-        if sname is None:
-            name = self.docx.document_name.split('/')
-            name[-1] = 'styles.xml'
-            if self.docx.exists(name):
-                sname = name
+
+        def get_name(rtype, defname):
+            # Use the name from the relationships if present, otherwise
+            # look for defname alongside the main document
+            name = relationships_by_type.get(rtype, None)
+            if name is None:
+                cname = self.docx.document_name.split('/')
+                cname[-1] = defname
+                cname = '/'.join(cname)
+                if self.docx.exists(cname):
+                    name = cname
+            return name
+
+        nname = get_name(NUMBERING, 'numbering.xml')
+        sname = get_name(STYLES, 'styles.xml')
+        numbering = Numbering()
+
         if sname is not None:
             try:
                 raw = self.docx.read(sname)
@@ -103,6 +115,16 @@ class Convert(object):
             else:
                 self.styles(fromstring(raw))
 
+        if nname is not None:
+            try:
+                raw = self.docx.read(nname)
+            except KeyError:
+                self.log.warn('Numbering styles %s do not exist' % nname)
+            else:
+                numbering(fromstring(raw), self.styles)
+
+        self.styles.resolve_numbering(numbering)
+
     def write(self):
         raw = html.tostring(self.html, encoding='utf-8', doctype='')
         with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f: