DOCX Input: Fix various issues with numbering->lists

DOCX Input: Handle numbered paragraphs where the numbering is specified
in the paragraph style, instead of on the paragraph directly. Also
support the use of arbitrary, styled text for bullets.
This commit is contained in:
Kovid Goyal 2013-09-09 11:13:35 +05:30
parent 14c1c6aabe
commit ec29e28944
5 changed files with 71 additions and 43 deletions

View File

@ -196,9 +196,9 @@ class RunStyle(object):
td = set()
if self.text_decoration is not inherit:
td.add(self.text_decoration)
if self.strike:
if self.strike and self.strike is not inherit:
td.add('line-through')
if self.dstrike:
if self.dstrike and self.dstrike is not inherit:
td.add('line-through')
if td:
c['text-decoration'] = ' '.join(td)
@ -206,7 +206,7 @@ class RunStyle(object):
c['text-transform'] = 'uppercase'
if self.i is True:
c['font-style'] = 'italic'
if self.shadow:
if self.shadow and self.shadow is not inherit:
c['text-shadow'] = '2px 2px'
if self.smallCaps is True:
c['font-variant'] = 'small-caps'

View File

@ -10,8 +10,6 @@ import os
from calibre.ebooks.docx.names import XPath
NBSP = '\xa0'
def mergeable(previous, current):
if previous.tail or current.tail:
return False
@ -116,7 +114,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
# Merge consecutive spans that have the same styling
current_run = []
for span in root.xpath('//span'):
for span in root.xpath('//span[not(@style)]'):
if not current_run:
current_run.append(span)
else:
@ -149,7 +147,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
parent.append(child)
# Make spans whose only styling is bold or italic into <b> and <i> tags
for span in root.xpath('//span[@class]'):
for span in root.xpath('//span[@class and not(@style)]'):
css = class_map.get(span.get('class', None), {})
if len(css) == 1:
if css == {'font-style':'italic'}:
@ -160,15 +158,9 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
del span.attrib['class']
# Get rid of <span>s that have no styling
for span in root.xpath('//span[not(@class) and not(@id)]'):
for span in root.xpath('//span[not(@class) and not(@id) and not(@style)]'):
lift(span)
# If a paragraph ends with a <br>, that <br> is not rendered in HTML, but
# it is in Word, so add a trailing space to ensure it is rendered.
for br in root.xpath('//*[contains("p,h1,h2,h3,h4,h5,h6,li", name())]/node()[position()=last()]/self::br'):
if not br.tail:
br.tail = NBSP
if detect_cover:
# Check if the first image in the document is possibly a cover
img = root.xpath('//img[@src][1]')

View File

@ -12,7 +12,7 @@ from collections import Counter
from lxml.html.builder import OL, UL, SPAN
from calibre.ebooks.docx.block_styles import ParagraphStyle
from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.char_styles import RunStyle, inherit
from calibre.ebooks.docx.names import XPath, get
STYLE_MAP = {
@ -40,6 +40,7 @@ class Level(object):
self.paragraph_style = self.character_style = None
self.is_numbered = False
self.num_template = None
self.bullet_template = None
self.pic_id = None
if lvl is not None:
@ -47,17 +48,17 @@ class Level(object):
def copy(self):
ans = Level()
for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template'):
for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template', 'bullet_template'):
setattr(ans, x, getattr(self, x))
return ans
def format_template(self, counter, ilvl):
def format_template(self, counter, ilvl, template):
def sub(m):
x = int(m.group(1)) - 1
if x > ilvl or x not in counter:
return ''
return '%d' % (counter[x] - (0 if x == ilvl else 1))
return re.sub(r'%(\d+)', sub, self.num_template).rstrip() + '\xa0'
return re.sub(r'%(\d+)', sub, template).rstrip() + '\xa0'
def read_from_xml(self, lvl, override=False):
for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
@ -72,6 +73,13 @@ class Level(object):
except (TypeError, ValueError):
pass
for rPr in XPath('./w:rPr')(lvl):
ps = RunStyle(rPr)
if self.character_style is None:
self.character_style = ps
else:
self.character_style.update(ps)
lt = None
for lr in XPath('./w:lvlText[@w:val]')(lvl):
lt = get(lr, 'w:val')
@ -80,7 +88,12 @@ class Level(object):
val = get(lr, 'w:val')
if val == 'bullet':
self.is_numbered = False
self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
cs = self.character_style
if lt in {'\uf0a7', 'o'} or (
cs.font_family is not inherit and cs.font_family.lower() in {'wingdings', 'symbol'}):
self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
else:
self.bullet_template = lt
for lpid in XPath('./w:lvlPicBulletId[@w:val]')(lvl):
self.pic_id = get(lpid, 'w:val')
else:
@ -99,13 +112,6 @@ class Level(object):
else:
self.paragraph_style.update(ps)
for rPr in XPath('./w:rPr')(lvl):
ps = RunStyle(rPr)
if self.character_style is None:
self.character_style = ps
else:
self.character_style.update(ps)
def css(self, images, pic_map, rid_map):
ans = {'list-style-type': self.fmt}
if self.pic_id:
@ -119,6 +125,14 @@ class Level(object):
ans['list-style-image'] = 'url("images/%s")' % fname
return ans
def char_css(self):
    """Return the CSS for this level's list-template text, minus the font.

    The result is built from ``self.character_style.css`` with any
    ``font-family`` entry dropped. An empty dict is returned when the
    character style is missing (e.g. ``None``) or has no ``css`` attribute.

    The returned mapping is a copy, so the caller may modify it freely and
    the mapping owned by the character style object is left untouched.
    """
    try:
        css = self.character_style.css
    except AttributeError:
        # No character style (or no .css on it): nothing to apply.
        return {}
    # Copy before popping so the style object's own mapping is not mutated.
    css = css.copy()
    css.pop('font-family', None)
    return css
class NumberingDefinition(object):
def __init__(self, parent=None):
@ -233,7 +247,10 @@ class Numbering(object):
p.set('list-lvl', str(ilvl))
p.set('list-id', num_id)
if lvl.num_template is not None:
val = lvl.format_template(counter, ilvl)
val = lvl.format_template(counter, ilvl, lvl.num_template)
p.set('list-template', val)
elif lvl.bullet_template is not None:
val = lvl.format_template(counter, ilvl, lvl.bullet_template)
p.set('list-template', val)
self.update_counter(counter, ilvl, d.levels)
@ -250,12 +267,15 @@ class Numbering(object):
ilvl = int(start.get('list-lvl'))
lvl = d.levels[ilvl]
lvlid = start.get('list-id') + start.get('list-lvl')
wrap = (OL if lvl.is_numbered else UL)('\n\t')
has_template = 'list-template' in start.attrib
wrap = (OL if lvl.is_numbered or has_template else UL)('\n\t')
if has_template:
wrap.set('lvlid', lvlid)
else:
wrap.set('class', styles.register(lvl.css(images, self.pic_map, self.rid_map), 'list'))
ccss = lvl.char_css()
if ccss:
ccss = styles.register(ccss, 'bullet')
parent.insert(idx, wrap)
last_val = None
for child in current_run:
@ -269,6 +289,8 @@ class Numbering(object):
span.append(gc)
child.append(span)
span = SPAN(child.get('list-template'))
if ccss:
span.set('class', ccss)
last = templates.get(lvlid, '')
if span.text and len(span.text) > len(last):
templates[lvlid] = span.text
@ -299,15 +321,12 @@ class Numbering(object):
commit(current_run)
commit(current_run)
# Convert the list items that use custom text for bullets into tables
# so that they display correctly
for wrap in body.xpath('//ol[@lvlid]'):
lvlid = wrap.attrib.pop('lvlid')
wrap.attrib.pop('lvlid')
wrap.tag = 'div'
text = ''
maxtext = templates.get(lvlid, '').replace('.', '')[:-1]
for li in wrap.iterchildren('li'):
t = li[0].text
if t and len(t) > len(text):
text = t
wrap.set('style', 'display:table')
for i, li in enumerate(wrap.iterchildren('li')):
li.tag = 'div'
li.attrib.pop('value', None)
@ -315,9 +334,8 @@ class Numbering(object):
obj = object_map[li]
bs = styles.para_cache[obj]
if i == 0:
m = len(maxtext) # Move the table left to simulate the behavior of a list (number is to the left of text margin)
wrap.set('style', 'display:table; margin-left: -%dem; padding-left: %s' % (m, bs.css.get('margin-left', 0)))
wrap.set('style', 'display:table; padding-left:%s' %
bs.css.get('margin-left', '0'))
bs.css.pop('margin-left', None)
for child in li:
child.set('style', 'display:table-cell')

View File

@ -92,7 +92,7 @@ class Style(object):
else:
self.character_style.update(rs)
if self.style_type == 'numbering':
if self.style_type in {'numbering', 'paragraph'}:
self.numbering_style_link = None
for x in XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
self.numbering_style_link = get(x, 'w:val')
@ -150,7 +150,7 @@ class Styles(object):
self.id_map[s.style_id] = s
if s.is_default:
self.default_styles[s.style_type] = s
if s.style_type == 'numbering' and s.numbering_style_link:
if getattr(s, 'numbering_style_link', None) is not None:
self.numbering_style_links[s.style_id] = s.numbering_style_link
self.default_paragraph_style = self.default_character_style = None
@ -212,6 +212,7 @@ class Styles(object):
def resolve_paragraph(self, p):
ans = self.para_cache.get(p, None)
if ans is None:
linked_style = None
ans = self.para_cache[p] = ParagraphStyle()
ans.style_name = None
direct_formatting = None
@ -233,7 +234,7 @@ class Styles(object):
default_para = self.default_styles.get('paragraph', None)
if direct_formatting.linked_style is not None:
ls = self.get(direct_formatting.linked_style)
ls = linked_style = self.get(direct_formatting.linked_style)
if ls is not None:
ans.style_name = ls.name
ps = ls.paragraph_style
@ -256,6 +257,11 @@ class Styles(object):
ps = self.numbering.get_para_style(num_id, lvl)
if ps is not None:
parent_styles.append(ps)
if not is_numbering and linked_style is not None and getattr(linked_style.paragraph_style, 'numbering', inherit) is not inherit:
num_id, lvl = linked_style.paragraph_style.numbering
if num_id is not None:
p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
is_numbering = True
for attr in ans.all_properties:
if not (is_numbering and attr == 'text_indent'): # skip text-indent for lists
@ -379,7 +385,7 @@ class Styles(object):
def resolve_numbering(self, numbering):
# When a numPr element appears inside a paragraph style, the lvl info
# must be discarder and pStyle used instead.
# must be discarded and pStyle used instead.
self.numbering = numbering
for style in self:
ps = style.paragraph_style

View File

@ -30,6 +30,8 @@ from calibre.ebooks.docx.fields import Fields
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
NBSP = '\xa0'
class Text:
def __init__(self, elem, attr, buf):
@ -396,7 +398,17 @@ class Convert(object):
if not dest.text and len(dest) == 0:
# Empty paragraph: add a non-breaking space so that it is rendered
# by WebKit
dest.text = '\xa0'
dest.text = NBSP
# If the last element in a block is a <br>, the <br> is not rendered in
# HTML unless it is followed by a trailing space. Word, on the other
# hand, inserts a blank line for trailing <br>s.
if len(dest) > 0 and not dest[-1].tail:
if dest[-1].tag == 'br':
dest[-1].tail = NBSP
elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
dest[-1][-1].tail = NBSP
return dest
def wrap_elems(self, elems, wrapper):