From ec29e28944a4e8cd28c5371476ce22ab5abec54f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 9 Sep 2013 11:13:35 +0530 Subject: [PATCH] DOCX Input: Fix various issues with numbering->lists DOCX Input: Handle numbered paragraphs where the numbering is specified in the paragraph style, instead of on the paragraph directly. Also support the use of arbitrary, styled text for bullets. --- src/calibre/ebooks/docx/char_styles.py | 6 +-- src/calibre/ebooks/docx/cleanup.py | 14 ++---- src/calibre/ebooks/docx/numbering.py | 66 ++++++++++++++++---------- src/calibre/ebooks/docx/styles.py | 14 ++++-- src/calibre/ebooks/docx/to_html.py | 14 +++++- 5 files changed, 71 insertions(+), 43 deletions(-) diff --git a/src/calibre/ebooks/docx/char_styles.py b/src/calibre/ebooks/docx/char_styles.py index c9a2fee4c9..e4c9383c68 100644 --- a/src/calibre/ebooks/docx/char_styles.py +++ b/src/calibre/ebooks/docx/char_styles.py @@ -196,9 +196,9 @@ class RunStyle(object): td = set() if self.text_decoration is not inherit: td.add(self.text_decoration) - if self.strike: + if self.strike and self.strike is not inherit: td.add('line-through') - if self.dstrike: + if self.dstrike and self.dstrike is not inherit: td.add('line-through') if td: c['text-decoration'] = ' '.join(td) @@ -206,7 +206,7 @@ class RunStyle(object): c['text-transform'] = 'uppercase' if self.i is True: c['font-style'] = 'italic' - if self.shadow: + if self.shadow and self.shadow is not inherit: c['text-shadow'] = '2px 2px' if self.smallCaps is True: c['font-variant'] = 'small-caps' diff --git a/src/calibre/ebooks/docx/cleanup.py b/src/calibre/ebooks/docx/cleanup.py index 8de6dff0ba..bb421afc89 100644 --- a/src/calibre/ebooks/docx/cleanup.py +++ b/src/calibre/ebooks/docx/cleanup.py @@ -10,8 +10,6 @@ import os from calibre.ebooks.docx.names import XPath -NBSP = '\xa0' - def mergeable(previous, current): if previous.tail or current.tail: return False @@ -116,7 +114,7 @@ def cleanup_markup(log, root, styles, dest_dir, 
detect_cover): # Merge consecutive spans that have the same styling current_run = [] - for span in root.xpath('//span'): + for span in root.xpath('//span[not(@style)]'): if not current_run: current_run.append(span) else: @@ -149,7 +147,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover): parent.append(child) # Make spans whose only styling is bold or italic into <b> and <i> tags - for span in root.xpath('//span[@class]'): + for span in root.xpath('//span[@class and not(@style)]'): css = class_map.get(span.get('class', None), {}) if len(css) == 1: if css == {'font-style':'italic'}: @@ -160,15 +158,9 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover): del span.attrib['class'] # Get rid of <span>s that have no styling - for span in root.xpath('//span[not(@class) and not(@id)]'): + for span in root.xpath('//span[not(@class) and not(@id) and not(@style)]'): lift(span) - # If a paragraph ends with a
<br>, that <br>
is not rendered in HTML, but - # it is in Word, so add a trailing space to ensure it is rendered. - for br in root.xpath('//*[contains("p,h1,h2,h3,h4,h5,h6,li", name())]/node()[position()=last()]/self::br'): - if not br.tail: - br.tail = NBSP - if detect_cover: # Check if the first image in the document is possibly a cover img = root.xpath('//img[@src][1]') diff --git a/src/calibre/ebooks/docx/numbering.py b/src/calibre/ebooks/docx/numbering.py index 2bf86eea27..602689a8cd 100644 --- a/src/calibre/ebooks/docx/numbering.py +++ b/src/calibre/ebooks/docx/numbering.py @@ -12,7 +12,7 @@ from collections import Counter from lxml.html.builder import OL, UL, SPAN from calibre.ebooks.docx.block_styles import ParagraphStyle -from calibre.ebooks.docx.char_styles import RunStyle +from calibre.ebooks.docx.char_styles import RunStyle, inherit from calibre.ebooks.docx.names import XPath, get STYLE_MAP = { @@ -40,6 +40,7 @@ class Level(object): self.paragraph_style = self.character_style = None self.is_numbered = False self.num_template = None + self.bullet_template = None self.pic_id = None if lvl is not None: @@ -47,17 +48,17 @@ class Level(object): def copy(self): ans = Level() - for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template'): + for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template', 'bullet_template'): setattr(ans, x, getattr(self, x)) return ans - def format_template(self, counter, ilvl): + def format_template(self, counter, ilvl, template): def sub(m): x = int(m.group(1)) - 1 if x > ilvl or x not in counter: return '' return '%d' % (counter[x] - (0 if x == ilvl else 1)) - return re.sub(r'%(\d+)', sub, self.num_template).rstrip() + '\xa0' + return re.sub(r'%(\d+)', sub, template).rstrip() + '\xa0' def read_from_xml(self, lvl, override=False): for lr in XPath('./w:lvlRestart[@w:val]')(lvl): @@ -72,6 +73,13 @@ class 
Level(object): except (TypeError, ValueError): pass + for rPr in XPath('./w:rPr')(lvl): + ps = RunStyle(rPr) + if self.character_style is None: + self.character_style = ps + else: + self.character_style.update(ps) + lt = None for lr in XPath('./w:lvlText[@w:val]')(lvl): lt = get(lr, 'w:val') @@ -80,7 +88,12 @@ class Level(object): val = get(lr, 'w:val') if val == 'bullet': self.is_numbered = False - self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc') + cs = self.character_style + if lt in {'\uf0a7', 'o'} or ( + cs.font_family is not inherit and cs.font_family.lower() in {'wingdings', 'symbol'}): + self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc') + else: + self.bullet_template = lt for lpid in XPath('./w:lvlPicBulletId[@w:val]')(lvl): self.pic_id = get(lpid, 'w:val') else: @@ -99,13 +112,6 @@ class Level(object): else: self.paragraph_style.update(ps) - for rPr in XPath('./w:rPr')(lvl): - ps = RunStyle(rPr) - if self.character_style is None: - self.character_style = ps - else: - self.character_style.update(ps) - def css(self, images, pic_map, rid_map): ans = {'list-style-type': self.fmt} if self.pic_id: @@ -119,6 +125,14 @@ class Level(object): ans['list-style-image'] = 'url("images/%s")' % fname return ans + def char_css(self): + try: + css = self.character_style.css + except AttributeError: + css = {} + css.pop('font-family', None) + return css + class NumberingDefinition(object): def __init__(self, parent=None): @@ -233,7 +247,10 @@ class Numbering(object): p.set('list-lvl', str(ilvl)) p.set('list-id', num_id) if lvl.num_template is not None: - val = lvl.format_template(counter, ilvl) + val = lvl.format_template(counter, ilvl, lvl.num_template) + p.set('list-template', val) + elif lvl.bullet_template is not None: + val = lvl.format_template(counter, ilvl, lvl.bullet_template) p.set('list-template', val) self.update_counter(counter, ilvl, d.levels) @@ -250,12 +267,15 @@ class Numbering(object): ilvl = int(start.get('list-lvl')) lvl = 
d.levels[ilvl] lvlid = start.get('list-id') + start.get('list-lvl') - wrap = (OL if lvl.is_numbered else UL)('\n\t') has_template = 'list-template' in start.attrib + wrap = (OL if lvl.is_numbered or has_template else UL)('\n\t') if has_template: wrap.set('lvlid', lvlid) else: wrap.set('class', styles.register(lvl.css(images, self.pic_map, self.rid_map), 'list')) + ccss = lvl.char_css() + if ccss: + ccss = styles.register(ccss, 'bullet') parent.insert(idx, wrap) last_val = None for child in current_run: @@ -269,6 +289,8 @@ class Numbering(object): span.append(gc) child.append(span) span = SPAN(child.get('list-template')) + if ccss: + span.set('class', ccss) last = templates.get(lvlid, '') if span.text and len(span.text) > len(last): templates[lvlid] = span.text @@ -299,15 +321,12 @@ class Numbering(object): commit(current_run) commit(current_run) + # Convert the list items that use custom text for bullets into tables + # so that they display correctly for wrap in body.xpath('//ol[@lvlid]'): - lvlid = wrap.attrib.pop('lvlid') + wrap.attrib.pop('lvlid') wrap.tag = 'div' - text = '' - maxtext = templates.get(lvlid, '').replace('.', '')[:-1] - for li in wrap.iterchildren('li'): - t = li[0].text - if t and len(t) > len(text): - text = t + wrap.set('style', 'display:table') for i, li in enumerate(wrap.iterchildren('li')): li.tag = 'div' li.attrib.pop('value', None) @@ -315,9 +334,8 @@ class Numbering(object): obj = object_map[li] bs = styles.para_cache[obj] if i == 0: - m = len(maxtext) # Move the table left to simulate the behavior of a list (number is to the left of text margin) - wrap.set('style', 'display:table; margin-left: -%dem; padding-left: %s' % (m, bs.css.get('margin-left', 0))) + wrap.set('style', 'display:table; padding-left:%s' % + bs.css.get('margin-left', '0')) bs.css.pop('margin-left', None) for child in li: child.set('style', 'display:table-cell') - diff --git a/src/calibre/ebooks/docx/styles.py b/src/calibre/ebooks/docx/styles.py index 
1201f696c2..142cb9e89d 100644 --- a/src/calibre/ebooks/docx/styles.py +++ b/src/calibre/ebooks/docx/styles.py @@ -92,7 +92,7 @@ class Style(object): else: self.character_style.update(rs) - if self.style_type == 'numbering': + if self.style_type in {'numbering', 'paragraph'}: self.numbering_style_link = None for x in XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem): self.numbering_style_link = get(x, 'w:val') @@ -150,7 +150,7 @@ class Styles(object): self.id_map[s.style_id] = s if s.is_default: self.default_styles[s.style_type] = s - if s.style_type == 'numbering' and s.numbering_style_link: + if getattr(s, 'numbering_style_link', None) is not None: self.numbering_style_links[s.style_id] = s.numbering_style_link self.default_paragraph_style = self.default_character_style = None @@ -212,6 +212,7 @@ class Styles(object): def resolve_paragraph(self, p): ans = self.para_cache.get(p, None) if ans is None: + linked_style = None ans = self.para_cache[p] = ParagraphStyle() ans.style_name = None direct_formatting = None @@ -233,7 +234,7 @@ class Styles(object): default_para = self.default_styles.get('paragraph', None) if direct_formatting.linked_style is not None: - ls = self.get(direct_formatting.linked_style) + ls = linked_style = self.get(direct_formatting.linked_style) if ls is not None: ans.style_name = ls.name ps = ls.paragraph_style @@ -256,6 +257,11 @@ class Styles(object): ps = self.numbering.get_para_style(num_id, lvl) if ps is not None: parent_styles.append(ps) + if not is_numbering and linked_style is not None and getattr(linked_style.paragraph_style, 'numbering', inherit) is not inherit: + num_id, lvl = linked_style.paragraph_style.numbering + if num_id is not None: + p.set('calibre_num_id', '%s:%s' % (lvl, num_id)) + is_numbering = True for attr in ans.all_properties: if not (is_numbering and attr == 'text_indent'): # skip text-indent for lists @@ -379,7 +385,7 @@ class Styles(object): def resolve_numbering(self, numbering): # When a numPr element appears inside 
a paragraph style, the lvl info - # must be discarder and pStyle used instead. + # must be discarded and pStyle used instead. self.numbering = numbering for style in self: ps = style.paragraph_style diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py index 103cf03e20..cdb50fcffd 100644 --- a/src/calibre/ebooks/docx/to_html.py +++ b/src/calibre/ebooks/docx/to_html.py @@ -30,6 +30,8 @@ from calibre.ebooks.docx.fields import Fields from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 +NBSP = '\xa0' + class Text: def __init__(self, elem, attr, buf): @@ -396,7 +398,17 @@ class Convert(object): if not dest.text and len(dest) == 0: # Empty paragraph add a non-breaking space so that it is rendered # by WebKit - dest.text = '\xa0' + dest.text = NBSP + + # If the last element in a block is a
<br> the <br>
is not rendered in + # HTML, unless it is followed by a trailing space. Word, on the other + # hand inserts a blank line for trailing <br>
s. + if len(dest) > 0 and not dest[-1].tail: + if dest[-1].tag == 'br': + dest[-1].tail = NBSP + elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail: + dest[-1][-1].tail = NBSP + return dest def wrap_elems(self, elems, wrapper):