From ec29e28944a4e8cd28c5371476ce22ab5abec54f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 9 Sep 2013 11:13:35 +0530
Subject: [PATCH] DOCX Input: Fix various issues with numbering->lists

DOCX Input: Handle numbered paragraphs where the numbering is specified
in the paragraph style, instead of on the paragraph directly. Also
support the use of arbitrary, styled text for bullets.
---
 src/calibre/ebooks/docx/char_styles.py |  6 +--
 src/calibre/ebooks/docx/cleanup.py     | 14 ++----
 src/calibre/ebooks/docx/numbering.py   | 66 ++++++++++++++++----------
 src/calibre/ebooks/docx/styles.py      | 14 ++++--
 src/calibre/ebooks/docx/to_html.py     | 14 +++++-
 5 files changed, 71 insertions(+), 43 deletions(-)
diff --git a/src/calibre/ebooks/docx/char_styles.py b/src/calibre/ebooks/docx/char_styles.py
index c9a2fee4c9..e4c9383c68 100644
--- a/src/calibre/ebooks/docx/char_styles.py
+++ b/src/calibre/ebooks/docx/char_styles.py
@@ -196,9 +196,9 @@ class RunStyle(object):
             td = set()
             if self.text_decoration is not inherit:
                 td.add(self.text_decoration)
-            if self.strike:
+            if self.strike and self.strike is not inherit:
                 td.add('line-through')
-            if self.dstrike:
+            if self.dstrike and self.dstrike is not inherit:
                 td.add('line-through')
             if td:
                 c['text-decoration'] = ' '.join(td)
@@ -206,7 +206,7 @@ class RunStyle(object):
                 c['text-transform'] = 'uppercase'
             if self.i is True:
                 c['font-style'] = 'italic'
-            if self.shadow:
+            if self.shadow and self.shadow is not inherit:
                 c['text-shadow'] = '2px 2px'
             if self.smallCaps is True:
                 c['font-variant'] = 'small-caps'
diff --git a/src/calibre/ebooks/docx/cleanup.py b/src/calibre/ebooks/docx/cleanup.py
index 8de6dff0ba..bb421afc89 100644
--- a/src/calibre/ebooks/docx/cleanup.py
+++ b/src/calibre/ebooks/docx/cleanup.py
@@ -10,8 +10,6 @@ import os
 
 from calibre.ebooks.docx.names import XPath
 
-NBSP = '\xa0'
-
 def mergeable(previous, current):
     if previous.tail or current.tail:
         return False
@@ -116,7 +114,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
 
     # Merge consecutive spans that have the same styling
     current_run = []
-    for span in root.xpath('//span'):
+    for span in root.xpath('//span[not(@style)]'):
         if not current_run:
             current_run.append(span)
         else:
@@ -149,7 +147,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
                     parent.append(child)
 
     # Make spans whose only styling is bold or italic into <b> and <i> tags
-    for span in root.xpath('//span[@class]'):
+    for span in root.xpath('//span[@class and not(@style)]'):
         css = class_map.get(span.get('class', None), {})
         if len(css) == 1:
             if css == {'font-style':'italic'}:
@@ -160,15 +158,9 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
                 del span.attrib['class']
 
     # Get rid of <span>s that have no styling
-    for span in root.xpath('//span[not(@class) and not(@id)]'):
+    for span in root.xpath('//span[not(@class) and not(@id) and not(@style)]'):
         lift(span)
 
-    # If a paragraph ends with a <br>, that <br> is not rendered in HTML, but
-    # it is in Word, so add a trailing space to ensure it is rendered.
-    for br in root.xpath('//*[contains("p,h1,h2,h3,h4,h5,h6,li", name())]/node()[position()=last()]/self::br'):
-        if not br.tail:
-            br.tail = NBSP
-
     if detect_cover:
         # Check if the first image in the document is possibly a cover
         img = root.xpath('//img[@src][1]')
diff --git a/src/calibre/ebooks/docx/numbering.py b/src/calibre/ebooks/docx/numbering.py
index 2bf86eea27..602689a8cd 100644
--- a/src/calibre/ebooks/docx/numbering.py
+++ b/src/calibre/ebooks/docx/numbering.py
@@ -12,7 +12,7 @@ from collections import Counter
 from lxml.html.builder import OL, UL, SPAN
 
 from calibre.ebooks.docx.block_styles import ParagraphStyle
-from calibre.ebooks.docx.char_styles import RunStyle
+from calibre.ebooks.docx.char_styles import RunStyle, inherit
 from calibre.ebooks.docx.names import XPath, get
 
 STYLE_MAP = {
@@ -40,6 +40,7 @@ class Level(object):
         self.paragraph_style = self.character_style = None
         self.is_numbered = False
         self.num_template = None
+        self.bullet_template = None
         self.pic_id = None
 
         if lvl is not None:
@@ -47,17 +48,17 @@ class Level(object):
 
     def copy(self):
         ans = Level()
-        for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template'):
+        for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template', 'bullet_template'):
             setattr(ans, x, getattr(self, x))
         return ans
 
-    def format_template(self, counter, ilvl):
+    def format_template(self, counter, ilvl, template):
         def sub(m):
             x = int(m.group(1)) - 1
             if x > ilvl or x not in counter:
                 return ''
             return '%d' % (counter[x] - (0 if x == ilvl else 1))
-        return re.sub(r'%(\d+)', sub, self.num_template).rstrip() + '\xa0'
+        return re.sub(r'%(\d+)', sub, template).rstrip() + '\xa0'
 
     def read_from_xml(self, lvl, override=False):
         for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
@@ -72,6 +73,13 @@ class Level(object):
             except (TypeError, ValueError):
                 pass
 
+        for rPr in XPath('./w:rPr')(lvl):
+            ps = RunStyle(rPr)
+            if self.character_style is None:
+                self.character_style = ps
+            else:
+                self.character_style.update(ps)
+
         lt = None
         for lr in XPath('./w:lvlText[@w:val]')(lvl):
             lt = get(lr, 'w:val')
@@ -80,7 +88,12 @@ class Level(object):
             val = get(lr, 'w:val')
             if val == 'bullet':
                 self.is_numbered = False
-                self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
+                cs = self.character_style  # may still be None when this level has no w:rPr
+                if lt in {'\uf0a7', 'o'} or (
+                    cs is not None and cs.font_family is not inherit and cs.font_family.lower() in {'wingdings', 'symbol'}):
+                    self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
+                else:
+                    self.bullet_template = lt
                 for lpid in XPath('./w:lvlPicBulletId[@w:val]')(lvl):
                     self.pic_id = get(lpid, 'w:val')
             else:
@@ -99,13 +112,6 @@ class Level(object):
             else:
                 self.paragraph_style.update(ps)
 
-        for rPr in XPath('./w:rPr')(lvl):
-            ps = RunStyle(rPr)
-            if self.character_style is None:
-                self.character_style = ps
-            else:
-                self.character_style.update(ps)
-
     def css(self, images, pic_map, rid_map):
         ans = {'list-style-type': self.fmt}
         if self.pic_id:
@@ -119,6 +125,14 @@ class Level(object):
                     ans['list-style-image'] = 'url("images/%s")' % fname
         return ans
 
+    def char_css(self):
+        try:
+            css = self.character_style.css
+        except AttributeError:  # e.g. character_style is still None (its initial value)
+            css = {}
+        css.pop('font-family', None)  # NOTE(review): pops from the dict returned by character_style.css itself; if that property caches its dict the removal persists -- confirm this is intended
+        return css
+
 class NumberingDefinition(object):
 
     def __init__(self, parent=None):
@@ -233,7 +247,10 @@ class Numbering(object):
                     p.set('list-lvl', str(ilvl))
                     p.set('list-id', num_id)
                     if lvl.num_template is not None:
-                        val = lvl.format_template(counter, ilvl)
+                        val = lvl.format_template(counter, ilvl, lvl.num_template)
+                        p.set('list-template', val)
+                    elif lvl.bullet_template is not None:
+                        val = lvl.format_template(counter, ilvl, lvl.bullet_template)
                         p.set('list-template', val)
                     self.update_counter(counter, ilvl, d.levels)
 
@@ -250,12 +267,15 @@ class Numbering(object):
             ilvl = int(start.get('list-lvl'))
             lvl = d.levels[ilvl]
             lvlid = start.get('list-id') + start.get('list-lvl')
-            wrap = (OL if lvl.is_numbered else UL)('\n\t')
             has_template = 'list-template' in start.attrib
+            wrap = (OL if lvl.is_numbered or has_template else UL)('\n\t')
             if has_template:
                 wrap.set('lvlid', lvlid)
             else:
                 wrap.set('class', styles.register(lvl.css(images, self.pic_map, self.rid_map), 'list'))
+            ccss = lvl.char_css()
+            if ccss:
+                ccss = styles.register(ccss, 'bullet')
             parent.insert(idx, wrap)
             last_val = None
             for child in current_run:
@@ -269,6 +289,8 @@ class Numbering(object):
                         span.append(gc)
                     child.append(span)
                     span = SPAN(child.get('list-template'))
+                    if ccss:
+                        span.set('class', ccss)
                     last = templates.get(lvlid, '')
                     if span.text and len(span.text) > len(last):
                         templates[lvlid] = span.text
@@ -299,15 +321,12 @@ class Numbering(object):
                     commit(current_run)
             commit(current_run)
 
+        # Convert the list items that use custom text for bullets into tables
+        # so that they display correctly
         for wrap in body.xpath('//ol[@lvlid]'):
-            lvlid = wrap.attrib.pop('lvlid')
+            wrap.attrib.pop('lvlid')
             wrap.tag = 'div'
-            text = ''
-            maxtext = templates.get(lvlid, '').replace('.', '')[:-1]
-            for li in wrap.iterchildren('li'):
-                t = li[0].text
-                if t and len(t) > len(text):
-                    text = t
+            wrap.set('style', 'display:table')
             for i, li in enumerate(wrap.iterchildren('li')):
                 li.tag = 'div'
                 li.attrib.pop('value', None)
@@ -315,9 +334,8 @@ class Numbering(object):
                 obj = object_map[li]
                 bs = styles.para_cache[obj]
                 if i == 0:
-                    m = len(maxtext)  # Move the table left to simulate the behavior of a list (number is to the left of text margin)
-                    wrap.set('style', 'display:table; margin-left: -%dem; padding-left: %s' % (m, bs.css.get('margin-left', 0)))
+                    wrap.set('style', 'display:table; padding-left:%s' %
+                             bs.css.get('margin-left', '0'))
                 bs.css.pop('margin-left', None)
                 for child in li:
                     child.set('style', 'display:table-cell')
-
diff --git a/src/calibre/ebooks/docx/styles.py b/src/calibre/ebooks/docx/styles.py
index 1201f696c2..142cb9e89d 100644
--- a/src/calibre/ebooks/docx/styles.py
+++ b/src/calibre/ebooks/docx/styles.py
@@ -92,7 +92,7 @@ class Style(object):
                 else:
                     self.character_style.update(rs)
 
-        if self.style_type == 'numbering':
+        if self.style_type in {'numbering', 'paragraph'}:
             self.numbering_style_link = None
             for x in XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
                 self.numbering_style_link = get(x, 'w:val')
@@ -150,7 +150,7 @@ class Styles(object):
                 self.id_map[s.style_id] = s
             if s.is_default:
                 self.default_styles[s.style_type] = s
-            if s.style_type == 'numbering' and s.numbering_style_link:
+            if getattr(s, 'numbering_style_link', None) is not None:
                 self.numbering_style_links[s.style_id] = s.numbering_style_link
 
         self.default_paragraph_style = self.default_character_style = None
@@ -212,6 +212,7 @@ class Styles(object):
     def resolve_paragraph(self, p):
         ans = self.para_cache.get(p, None)
         if ans is None:
+            linked_style = None
             ans = self.para_cache[p] = ParagraphStyle()
             ans.style_name = None
             direct_formatting = None
@@ -233,7 +234,7 @@ class Styles(object):
 
             default_para = self.default_styles.get('paragraph', None)
             if direct_formatting.linked_style is not None:
-                ls = self.get(direct_formatting.linked_style)
+                ls = linked_style = self.get(direct_formatting.linked_style)
                 if ls is not None:
                     ans.style_name = ls.name
                     ps = ls.paragraph_style
@@ -256,6 +257,11 @@ class Styles(object):
                     ps = self.numbering.get_para_style(num_id, lvl)
                     if ps is not None:
                         parent_styles.append(ps)
+            if not is_numbering and linked_style is not None and getattr(linked_style.paragraph_style, 'numbering', inherit) is not inherit:
+                num_id, lvl = linked_style.paragraph_style.numbering
+                if num_id is not None:
+                    p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
+                is_numbering = True
 
             for attr in ans.all_properties:
                 if not (is_numbering and attr == 'text_indent'):  # skip text-indent for lists
@@ -379,7 +385,7 @@ class Styles(object):
 
     def resolve_numbering(self, numbering):
         # When a numPr element appears inside a paragraph style, the lvl info
-        # must be discarder and pStyle used instead.
+        # must be discarded and pStyle used instead.
         self.numbering = numbering
         for style in self:
             ps = style.paragraph_style
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
index 103cf03e20..cdb50fcffd 100644
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -30,6 +30,8 @@ from calibre.ebooks.docx.fields import Fields
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
 
+NBSP = '\xa0'
+
 class Text:
 
     def __init__(self, elem, attr, buf):
@@ -396,7 +398,17 @@ class Convert(object):
         if not dest.text and len(dest) == 0:
             # Empty paragraph add a non-breaking space so that it is rendered
             # by WebKit
-            dest.text = '\xa0'
+            dest.text = NBSP
+
+        # If the last element in a block is a <br>, the <br> is not rendered in
+        # HTML unless it is followed by a trailing space. Word, on the other
+        # hand, inserts a blank line for trailing <br>s.
+        if len(dest) > 0 and not dest[-1].tail:
+            if dest[-1].tag == 'br':
+                dest[-1].tail = NBSP
+            elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
+                dest[-1][-1].tail = NBSP
+
         return dest
 
     def wrap_elems(self, elems, wrapper):