DOCX Input: Lists work

This commit is contained in:
Kovid Goyal 2013-05-13 18:39:33 +05:30
parent cc223574d0
commit ed422c7b0f
4 changed files with 200 additions and 17 deletions

View File

@ -175,6 +175,20 @@ def read_shd(parent, dest):
if val: if val:
ans = simple_color(val, auto='transparent') ans = simple_color(val, auto='transparent')
setattr(dest, 'background_color', ans) setattr(dest, 'background_color', ans)
def read_numbering(parent, dest):
    '''
    Read the numbering properties (w:numPr) found directly under ``parent``
    and store them on ``dest`` as the ``numbering`` attribute.

    The stored value is a ``(num_id, lvl)`` tuple when at least one of the two
    was found: ``num_id`` is the raw w:val of w:numId, ``lvl`` is the w:val of
    w:ilvl converted to int. Either member may be None. When neither is
    found, ``inherit`` is stored instead.
    '''
    level = list_id = None
    for num_pr in XPath('./w:numPr')(parent):
        for level_elem in XPath('./w:ilvl[@w:val]')(num_pr):
            try:
                level = int(get(level_elem, 'w:val'))
            except (ValueError, TypeError):
                # Not an integer: keep whatever level was read before
                pass
        for id_elem in XPath('./w:numId[@w:val]')(num_pr):
            list_id = get(id_elem, 'w:val')
    if list_id is None and level is None:
        dest.numbering = inherit
    else:
        dest.numbering = (list_id, level)
# }}} # }}}
class ParagraphStyle(object): class ParagraphStyle(object):
@ -194,6 +208,7 @@ class ParagraphStyle(object):
# Misc. # Misc.
'text_indent', 'text_align', 'line_height', 'direction', 'background_color', 'text_indent', 'text_align', 'line_height', 'direction', 'background_color',
'numbering',
) )
def __init__(self, pPr=None): def __init__(self, pPr=None):
@ -210,7 +225,7 @@ class ParagraphStyle(object):
): ):
setattr(self, p, binary_property(pPr, p)) setattr(self, p, binary_property(pPr, p))
for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd'): for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd', 'numbering'):
f = globals()['read_%s' % x] f = globals()['read_%s' % x]
f(pPr, self) f(pPr, self)

View File

@ -6,6 +6,11 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from collections import Counter
from lxml.html.builder import OL, UL, SPAN
from calibre.ebooks.docx.block_styles import ParagraphStyle from calibre.ebooks.docx.block_styles import ParagraphStyle
from calibre.ebooks.docx.char_styles import RunStyle from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.names import XPath, get from calibre.ebooks.docx.names import XPath, get
@ -33,10 +38,26 @@ class Level(object):
self.fmt = 'decimal' self.fmt = 'decimal'
self.para_link = None self.para_link = None
self.paragraph_style = self.character_style = None self.paragraph_style = self.character_style = None
self.is_numbered = False
self.num_template = None
if lvl is not None: if lvl is not None:
self.read_from_xml(lvl) self.read_from_xml(lvl)
def copy(self):
    '''
    Return a fresh Level with the same attribute values as this one.

    The values are assigned as-is (no deep copy), so mutable style objects
    are shared between the original and the copy.
    '''
    clone = Level()
    copied_attrs = (
        'restart', 'start', 'fmt', 'para_link', 'paragraph_style',
        'character_style', 'is_numbered', 'num_template',
    )
    for name in copied_attrs:
        value = getattr(self, name)
        setattr(clone, name, value)
    return clone
def format_template(self, counter, ilvl):
    '''
    Expand the ``%N`` placeholders in ``self.num_template``.

    :param counter: mapping of zero-based level index to its current count
    :param ilvl: zero-based index of the level being rendered

    ``%N`` is one-based, so it refers to level ``N - 1``. A placeholder for a
    level deeper than ``ilvl``, or for a level missing from ``counter``,
    expands to the empty string. The count of ``ilvl`` itself is used
    unchanged, while the count of any other level is reduced by one.
    Trailing whitespace is stripped and a no-break space is appended.
    '''
    def expand(match):
        level = int(match.group(1)) - 1
        if level > ilvl or level not in counter:
            return ''
        number = counter[level]
        if level != ilvl:
            # presumably the ancestor counter was already advanced past the
            # item currently open -- confirm against the caller
            number -= 1
        return '%d' % number

    expanded = re.sub(r'%(\d+)', expand, self.num_template)
    return expanded.rstrip() + '\xa0'
def read_from_xml(self, lvl, override=False): def read_from_xml(self, lvl, override=False):
for lr in XPath('./w:lvlRestart[@w:val]')(lvl): for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
try: try:
@ -57,9 +78,13 @@ class Level(object):
for lr in XPath('./w:numFmt[@w:val]')(lvl): for lr in XPath('./w:numFmt[@w:val]')(lvl):
val = get(lr, 'w:val') val = get(lr, 'w:val')
if val == 'bullet': if val == 'bullet':
self.is_numbered = False
self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc') self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
else: else:
self.is_numbered = True
self.fmt = STYLE_MAP.get(val, 'decimal') self.fmt = STYLE_MAP.get(val, 'decimal')
if lt and re.match(r'%\d+\.$', lt) is None:
self.num_template = lt
for lr in XPath('./w:pStyle[@w:val]')(lvl): for lr in XPath('./w:pStyle[@w:val]')(lvl):
self.para_link = get(lr, 'w:val') self.para_link = get(lr, 'w:val')
@ -78,12 +103,6 @@ class Level(object):
else: else:
self.character_style.update(ps) self.character_style.update(ps)
def copy(self):
    '''
    Return a fresh Level with the same attribute values as this one.

    The attribute list includes ``is_numbered`` and ``num_template`` so that
    this definition agrees with the other ``copy`` in the class. A later
    ``def`` with the same name replaces the earlier one, so leaving them out
    here would give copies the default values for both attributes.
    Values are assigned as-is (no deep copy).
    '''
    ans = Level()
    for x in ('restart', 'start', 'fmt', 'para_link', 'paragraph_style',
              'character_style', 'is_numbered', 'num_template'):
        setattr(ans, x, getattr(self, x))
    return ans
class NumberingDefinition(object): class NumberingDefinition(object):
def __init__(self, parent=None): def __init__(self, parent=None):
@ -107,6 +126,7 @@ class Numbering(object):
def __init__(self): def __init__(self):
self.definitions = {} self.definitions = {}
self.instances = {} self.instances = {}
self.counters = {}
def __call__(self, root, styles): def __call__(self, root, styles):
' Read all numbering style definitions ' ' Read all numbering style definitions '
@ -131,6 +151,7 @@ class Numbering(object):
if alvl is None: if alvl is None:
alvl = Level() alvl = Level()
alvl.read_from_xml(lvl, override=True) alvl.read_from_xml(lvl, override=True)
return nd
next_pass = {} next_pass = {}
for n in XPath('./w:num[@w:numId]')(root): for n in XPath('./w:num[@w:numId]')(root):
@ -154,3 +175,114 @@ class Numbering(object):
if d is not None: if d is not None:
self.instances[num_id] = create_instance(n, d) self.instances[num_id] = create_instance(n, d)
for num_id, d in self.instances.iteritems():
self.counters[num_id] = Counter({lvl:d.levels[lvl].start for lvl in d.levels})
def get_pstyle(self, num_id, style_id):
    '''
    Return the index of the level in numbering instance ``num_id`` whose
    ``para_link`` equals ``style_id``.

    Returns None when there is no such instance or no level matches.
    '''
    instance = self.instances.get(num_id, None)
    if instance is None:
        return None
    for level_index, level in instance.levels.iteritems():
        if level.para_link == style_id:
            return level_index
    return None
def get_para_style(self, num_id, lvl):
    '''
    Return the ``paragraph_style`` of level ``lvl`` in numbering instance
    ``num_id``.

    Returns None when the instance or the level does not exist.
    '''
    instance = self.instances.get(num_id, None)
    if instance is None:
        return None
    level = instance.levels.get(lvl, None)
    # getattr with a default also covers level being None
    return getattr(level, 'paragraph_style', None)
def update_counter(self, counter, levelnum, levels):
    '''
    Advance the count of level ``levelnum`` by one and reset dependent levels.

    :param counter: mutable mapping of level index to current count
    :param levelnum: index of the level that was just used
    :param levels: mapping of level index to level object (``restart`` and
                   ``start`` attributes are read)

    A level is reset to its ``start`` value when its ``restart`` equals
    ``levelnum + 1``, or, if it has no ``restart`` setting, when it is the
    level directly below ``levelnum``.
    '''
    counter[levelnum] += 1
    trigger = levelnum + 1
    for index, level in levels.iteritems():
        configured = level.restart
        if configured is None:
            resets = index == trigger
        else:
            resets = configured == trigger
        if resets:
            counter[index] = level.start
def apply_markup(self, items, body, styles, object_map):
    '''
    Turn numbered paragraphs into HTML list markup.

    :param items: iterable of ``(p, num_id, ilvl)`` triples: the HTML element,
                  the numbering instance id and the integer level
    :param body: element whose descendants are scanned for ``li`` elements
    :param styles: object providing ``register()`` and ``para_cache``
    :param object_map: mapping from HTML element to the object used as key
                       into ``styles.para_cache``

    Works in three passes: tag each item as ``li`` with temporary ``list-*``
    attributes, wrap runs of adjacent ``li`` siblings in ``ol``/``ul``, then
    rewrite templated ``ol`` wrappers as CSS table ``div`` elements.
    '''
    # Pass 1: mark every paragraph that belongs to a known instance/level
    for p, num_id, ilvl in items:
        d = self.instances.get(num_id, None)
        if d is not None:
            lvl = d.levels.get(ilvl, None)
            if lvl is not None:
                counter = self.counters[num_id]
                p.tag = 'li'
                # The value is recorded before the counter is advanced below
                p.set('value', '%s' % counter[ilvl])
                p.set('list-lvl', str(ilvl))
                p.set('list-id', num_id)
                if lvl.num_template is not None:
                    val = lvl.format_template(counter, ilvl)
                    p.set('list-template', val)
                self.update_counter(counter, ilvl, d.levels)

    def commit(current_run):
        # Wrap the li elements in current_run in a single ol/ul inserted at
        # the position of the first one, then empty current_run in place.
        if not current_run:
            return
        start = current_run[0]
        parent = start.getparent()
        idx = parent.index(start)
        # Instance and level are taken from the first element of the run
        d = self.instances[start.get('list-id')]
        ilvl = int(start.get('list-lvl'))
        lvl = d.levels[ilvl]
        lvlid = start.get('list-id') + start.get('list-lvl')
        wrap = (OL if lvl.is_numbered else UL)('\n\t')
        has_template = 'list-template' in start.attrib
        if has_template:
            # Marker attribute picked up by the final pass of apply_markup
            wrap.set('lvlid', lvlid)
        else:
            wrap.set('class', styles.register({'list-style-type': lvl.fmt}, 'list'))
        parent.insert(idx, wrap)
        last_val = None
        for child in current_run:
            wrap.append(child)
            child.tail = '\n\t'
            if has_template:
                # Move the existing text and children into one span, then put
                # a second span holding the template text in front of it
                span = SPAN()
                span.text = child.text
                child.text = None
                for gc in child:
                    span.append(gc)
                child.append(span)
                span = SPAN(child.get('list-template'))
                child.insert(0, span)
            # NOTE(review): placement of this cleanup outside the
            # has_template branch is reconstructed from a listing without
            # indentation -- confirm
            for attr in ('list-lvl', 'list-id', 'list-template'):
                child.attrib.pop(attr, None)
            val = int(child.get('value'))
            # Drop the explicit value when it just continues the previous
            # item's value by one, or when the wrapper is a ul
            if last_val == val - 1 or wrap.tag == 'ul':
                child.attrib.pop('value')
            last_val = val
        current_run[-1].tail = '\n'
        del current_run[:]

    # Pass 2: collect every element that has an li child ...
    parents = set()
    for child in body.iterdescendants('li'):
        parents.add(child.getparent())

    # ... and group its adjacent li children. A run ends at a non-li sibling
    # or when the (list-id, list-lvl) pair changes.
    for parent in parents:
        current_run = []
        for child in parent:
            if child.tag == 'li':
                if current_run:
                    last = current_run[-1]
                    if (last.get('list-id') , last.get('list-lvl')) != (child.get('list-id'), child.get('list-lvl')):
                        commit(current_run)
                current_run.append(child)
            else:
                commit(current_run)
        commit(current_run)

    # Pass 3: wrappers marked with lvlid become div elements laid out as a
    # CSS table, one row per item.
    # NOTE(review): only ol elements are matched here; a ul wrapper that
    # received 'lvlid' in commit() is left unchanged -- confirm intended
    for wrap in body.xpath('//ol[@lvlid]'):
        wrap.attrib.pop('lvlid')
        wrap.tag = 'div'
        for i, li in enumerate(wrap.iterchildren('li')):
            li.tag = 'div'
            li.attrib.pop('value', None)
            li.set('style', 'display:table-row')
            obj = object_map[li]
            bs = styles.para_cache[obj]
            if i == 0:
                # The first row's margin-left is applied to the wrapper
                wrap.set('style', 'display:table; margin-left: %s' % (bs.css.get('margin-left', 0)))
            # NOTE(review): assumed to run for every row, not only the
            # first; nesting reconstructed from a flattened listing -- confirm
            bs.css.pop('margin-left', None)
            for child in li:
                child.set('style', 'display:table-cell')

View File

@ -198,8 +198,19 @@ class Styles(object):
if default_para.character_style is not None: if default_para.character_style is not None:
self.para_char_cache[p] = default_para.character_style self.para_char_cache[p] = default_para.character_style
is_numbering = direct_formatting.numbering is not inherit
if is_numbering:
num_id, lvl = direct_formatting.numbering
if num_id is not None:
p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
if num_id is not None and lvl is not None:
ps = self.numbering.get_para_style(num_id, lvl)
if ps is not None:
parent_styles.append(ps)
for attr in ans.all_properties: for attr in ans.all_properties:
setattr(ans, attr, self.para_val(parent_styles, direct_formatting, attr)) if not (is_numbering and attr == 'text_indent'): # skip text-indent for lists
setattr(ans, attr, self.para_val(parent_styles, direct_formatting, attr))
return ans return ans
def resolve_run(self, r): def resolve_run(self, r):
@ -244,10 +255,20 @@ class Styles(object):
return self.resolve_run(obj) return self.resolve_run(obj)
def resolve_numbering(self, numbering):
    '''
    Store ``numbering`` on this object and fix up the numbering information
    of every paragraph style.

    When a numPr element appears inside a paragraph style, its level
    information must be discarded and the level that links to the style
    (pStyle) used instead. Styles for which ``numbering.get_pstyle()`` finds
    no such level have their numbering reset to ``inherit``.
    '''
    self.numbering = numbering
    for style in self:
        para = style.paragraph_style
        if para is None or para.numbering is inherit:
            continue
        level = numbering.get_pstyle(para.numbering[0], style.style_id)
        if level is None:
            para.numbering = inherit
        else:
            para.numbering = (para.numbering[0], level)
def register(self, css, prefix): def register(self, css, prefix):
h = hash(tuple(css.iteritems())) h = hash(frozenset(css.iteritems()))
ans, _ = self.classes.get(h, (None, None)) ans, _ = self.classes.get(h, (None, None))
if ans is None: if ans is None:
self.counter[prefix] += 1 self.counter[prefix] += 1
@ -266,13 +287,15 @@ class Styles(object):
self.register(css, 'text') self.register(css, 'text')
def class_name(self, css):
    '''
    Return the class name stored in ``self.classes`` for ``css``, or None
    when nothing is stored for it.

    The lookup key is the hash of the frozenset of the (property, value)
    pairs, so it does not depend on the order of the pairs.
    '''
    key = hash(frozenset(css.iteritems()))
    entry = self.classes.get(key, (None, None))
    return entry[0]
def generate_css(self): def generate_css(self):
prefix = textwrap.dedent( prefix = textwrap.dedent(
'''\ '''\
p { margin: 0; padding: 0; text-indent: 1.5em } p { text-indent: 1.5em }
ul, ol, p { margin: 0; padding: 0 }
''') ''')
ans = [] ans = []

View File

@ -7,6 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import sys, os, re import sys, os, re
from collections import OrderedDict
from lxml import html from lxml import html
from lxml.html.builder import ( from lxml.html.builder import (
@ -36,7 +37,7 @@ class Convert(object):
self.mi = self.docx.metadata self.mi = self.docx.metadata
self.body = BODY() self.body = BODY()
self.styles = Styles() self.styles = Styles()
self.object_map = {} self.object_map = OrderedDict()
self.html = HTML( self.html = HTML(
HEAD( HEAD(
META(charset='utf-8'), META(charset='utf-8'),
@ -72,6 +73,19 @@ class Convert(object):
pass # TODO: Last section properties pass # TODO: Last section properties
else: else:
self.log.debug('Unknown top-level tag: %s, ignoring' % barename(top_level.tag)) self.log.debug('Unknown top-level tag: %s, ignoring' % barename(top_level.tag))
numbered = []
for html_obj, obj in self.object_map.iteritems():
raw = obj.get('calibre_num_id', None)
if raw is not None:
lvl, num_id = raw.partition(':')[0::2]
try:
lvl = int(lvl)
except (TypeError, ValueError):
lvl = 0
numbered.append((html_obj, num_id, lvl))
self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map)
if len(self.body) > 0: if len(self.body) > 0:
self.body.text = '\n\t' self.body.text = '\n\t'
for child in self.body: for child in self.body:
@ -102,7 +116,7 @@ class Convert(object):
nname = get_name(NUMBERING, 'numbering.xml') nname = get_name(NUMBERING, 'numbering.xml')
sname = get_name(STYLES, 'styles.xml') sname = get_name(STYLES, 'styles.xml')
numbering = Numbering() numbering = self.numbering = Numbering()
if sname is not None: if sname is not None:
try: try:
@ -133,6 +147,7 @@ class Convert(object):
def convert_p(self, p): def convert_p(self, p):
dest = P() dest = P()
self.object_map[dest] = p
style = self.styles.resolve_paragraph(p) style = self.styles.resolve_paragraph(p)
for run in XPath('descendant::w:r')(p): for run in XPath('descendant::w:r')(p):
span = self.convert_run(run) span = self.convert_run(run)
@ -173,7 +188,6 @@ class Convert(object):
wrapper = self.wrap_elems(spans, SPAN()) wrapper = self.wrap_elems(spans, SPAN())
wrapper.set('class', cls) wrapper.set('class', cls)
self.object_map[dest] = p
return dest return dest
def wrap_elems(self, elems, wrapper): def wrap_elems(self, elems, wrapper):
@ -188,7 +202,7 @@ class Convert(object):
def convert_run(self, run): def convert_run(self, run):
ans = SPAN() ans = SPAN()
ans.run = run self.object_map[ans] = run
text = Text(ans, 'text', []) text = Text(ans, 'text', [])
for child in run: for child in run:
@ -224,7 +238,6 @@ class Convert(object):
ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup' ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
if style.lang is not inherit: if style.lang is not inherit:
ans.lang = style.lang ans.lang = style.lang
self.object_map[ans] = run
return ans return ans
if __name__ == '__main__': if __name__ == '__main__':