DOCX Input: Fix various issues with numbering->lists

DOCX Input: Handle numbered paragraphs where the numbering is specified in the paragraph style, instead of on the paragraph directly. Also support the use of arbitrary, styled text for bullets.
2025-07-09 03:04:10 -04:00 · 2013-09-09 11:13:35 +05:30 · 2013-09-09 11:13:35 +05:30 · ec29e28944
commit ec29e28944
parent 14c1c6aabe
5 changed files with 71 additions and 43 deletions
--- a/src/calibre/ebooks/docx/char_styles.py
+++ b/src/calibre/ebooks/docx/char_styles.py
@ -196,9 +196,9 @@ class RunStyle(object):
            td = set()
            if self.text_decoration is not inherit:
                td.add(self.text_decoration)
-            if self.strike:
+            if self.strike and self.strike is not inherit:
                td.add('line-through')
-            if self.dstrike:
+            if self.dstrike and self.dstrike is not inherit:
                td.add('line-through')
            if td:
                c['text-decoration'] = ' '.join(td)
@ -206,7 +206,7 @@ class RunStyle(object):
                c['text-transform'] = 'uppercase'
            if self.i is True:
                c['font-style'] = 'italic'
-            if self.shadow:
+            if self.shadow and self.shadow is not inherit:
                c['text-shadow'] = '2px 2px'
            if self.smallCaps is True:
                c['font-variant'] = 'small-caps'
--- a/src/calibre/ebooks/docx/cleanup.py
+++ b/src/calibre/ebooks/docx/cleanup.py
@ -10,8 +10,6 @@ import os

 from calibre.ebooks.docx.names import XPath

-NBSP = '\xa0'
-
 def mergeable(previous, current):
    if previous.tail or current.tail:
        return False
@ -116,7 +114,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):

    # Merge consecutive spans that have the same styling
    current_run = []
-    for span in root.xpath('//span'):
+    for span in root.xpath('//span[not(@style)]'):
        if not current_run:
            current_run.append(span)
        else:
@ -149,7 +147,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
                    parent.append(child)

    # Make spans whose only styling is bold or italic into <b> and <i> tags
-    for span in root.xpath('//span[@class]'):
+    for span in root.xpath('//span[@class and not(@style)]'):
        css = class_map.get(span.get('class', None), {})
        if len(css) == 1:
            if css == {'font-style':'italic'}:
@ -160,15 +158,9 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
                del span.attrib['class']

    # Get rid of <span>s that have no styling
-    for span in root.xpath('//span[not(@class) and not(@id)]'):
+    for span in root.xpath('//span[not(@class) and not(@id) and not(@style)]'):
        lift(span)

-    # If a paragraph ends with a <br>, that <br> is not rendered in HTML, but
-    # it is in Word, so add a trailing space to ensure it is rendered.
-    for br in root.xpath('//*[contains("p,h1,h2,h3,h4,h5,h6,li", name())]/node()[position()=last()]/self::br'):
-        if not br.tail:
-            br.tail = NBSP
-
    if detect_cover:
        # Check if the first image in the document is possibly a cover
        img = root.xpath('//img[@src][1]')
--- a/src/calibre/ebooks/docx/numbering.py
+++ b/src/calibre/ebooks/docx/numbering.py
@ -12,7 +12,7 @@ from collections import Counter
 from lxml.html.builder import OL, UL, SPAN

 from calibre.ebooks.docx.block_styles import ParagraphStyle
-from calibre.ebooks.docx.char_styles import RunStyle
+from calibre.ebooks.docx.char_styles import RunStyle, inherit
 from calibre.ebooks.docx.names import XPath, get

 STYLE_MAP = {
@ -40,6 +40,7 @@ class Level(object):
        self.paragraph_style = self.character_style = None
        self.is_numbered = False
        self.num_template = None
+        self.bullet_template = None
        self.pic_id = None

        if lvl is not None:
@ -47,17 +48,17 @@ class Level(object):

    def copy(self):
        ans = Level()
-        for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template'):
+        for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template', 'bullet_template'):
            setattr(ans, x, getattr(self, x))
        return ans

-    def format_template(self, counter, ilvl):
+    def format_template(self, counter, ilvl, template):
        def sub(m):
            x = int(m.group(1)) - 1
            if x > ilvl or x not in counter:
                return ''
            return '%d' % (counter[x] - (0 if x == ilvl else 1))
-        return re.sub(r'%(\d+)', sub, self.num_template).rstrip() + '\xa0'
+        return re.sub(r'%(\d+)', sub, template).rstrip() + '\xa0'

    def read_from_xml(self, lvl, override=False):
        for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
@ -72,6 +73,13 @@ class Level(object):
            except (TypeError, ValueError):
                pass

+        for rPr in XPath('./w:rPr')(lvl):
+            ps = RunStyle(rPr)
+            if self.character_style is None:
+                self.character_style = ps
+            else:
+                self.character_style.update(ps)
+
        lt = None
        for lr in XPath('./w:lvlText[@w:val]')(lvl):
            lt = get(lr, 'w:val')
@ -80,7 +88,12 @@ class Level(object):
            val = get(lr, 'w:val')
            if val == 'bullet':
                self.is_numbered = False
-                self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
+                cs = self.character_style
+                if lt in {'\uf0a7', 'o'} or (
+                    cs.font_family is not inherit and cs.font_family.lower() in {'wingdings', 'symbol'}):
+                    self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
+                else:
+                    self.bullet_template = lt
                for lpid in XPath('./w:lvlPicBulletId[@w:val]')(lvl):
                    self.pic_id = get(lpid, 'w:val')
            else:
@ -99,13 +112,6 @@ class Level(object):
            else:
                self.paragraph_style.update(ps)

-        for rPr in XPath('./w:rPr')(lvl):
-            ps = RunStyle(rPr)
-            if self.character_style is None:
-                self.character_style = ps
-            else:
-                self.character_style.update(ps)
-
    def css(self, images, pic_map, rid_map):
        ans = {'list-style-type': self.fmt}
        if self.pic_id:
@ -119,6 +125,14 @@ class Level(object):
                    ans['list-style-image'] = 'url("images/%s")' % fname
        return ans

+    def char_css(self):
+        try:
+            css = self.character_style.css
+        except AttributeError:
+            css = {}
+        css.pop('font-family', None)
+        return css
+
 class NumberingDefinition(object):

    def __init__(self, parent=None):
@ -233,7 +247,10 @@ class Numbering(object):
                    p.set('list-lvl', str(ilvl))
                    p.set('list-id', num_id)
                    if lvl.num_template is not None:
-                        val = lvl.format_template(counter, ilvl)
+                        val = lvl.format_template(counter, ilvl, lvl.num_template)
+                        p.set('list-template', val)
+                    elif lvl.bullet_template is not None:
+                        val = lvl.format_template(counter, ilvl, lvl.bullet_template)
                        p.set('list-template', val)
                    self.update_counter(counter, ilvl, d.levels)

@ -250,12 +267,15 @@ class Numbering(object):
            ilvl = int(start.get('list-lvl'))
            lvl = d.levels[ilvl]
            lvlid = start.get('list-id') + start.get('list-lvl')
-            wrap = (OL if lvl.is_numbered else UL)('\n\t')
            has_template = 'list-template' in start.attrib
+            wrap = (OL if lvl.is_numbered or has_template else UL)('\n\t')
            if has_template:
                wrap.set('lvlid', lvlid)
            else:
                wrap.set('class', styles.register(lvl.css(images, self.pic_map, self.rid_map), 'list'))
+            ccss = lvl.char_css()
+            if ccss:
+                ccss = styles.register(ccss, 'bullet')
            parent.insert(idx, wrap)
            last_val = None
            for child in current_run:
@ -269,6 +289,8 @@ class Numbering(object):
                        span.append(gc)
                    child.append(span)
                    span = SPAN(child.get('list-template'))
+                    if ccss:
+                        span.set('class', ccss)
                    last = templates.get(lvlid, '')
                    if span.text and len(span.text) > len(last):
                        templates[lvlid] = span.text
@ -299,15 +321,12 @@ class Numbering(object):
                    commit(current_run)
            commit(current_run)

+        # Convert the list items that use custom text for bullets into tables
+        # so that they display correctly
        for wrap in body.xpath('//ol[@lvlid]'):
-            lvlid = wrap.attrib.pop('lvlid')
+            wrap.attrib.pop('lvlid')
            wrap.tag = 'div'
-            text = ''
-            maxtext = templates.get(lvlid, '').replace('.', '')[:-1]
-            for li in wrap.iterchildren('li'):
-                t = li[0].text
-                if t and len(t) > len(text):
-                    text = t
+            wrap.set('style', 'display:table')
            for i, li in enumerate(wrap.iterchildren('li')):
                li.tag = 'div'
                li.attrib.pop('value', None)
@ -315,9 +334,8 @@ class Numbering(object):
                obj = object_map[li]
                bs = styles.para_cache[obj]
                if i == 0:
-                    m = len(maxtext)  # Move the table left to simulate the behavior of a list (number is to the left of text margin)
-                    wrap.set('style', 'display:table; margin-left: -%dem; padding-left: %s' % (m, bs.css.get('margin-left', 0)))
+                    wrap.set('style', 'display:table; padding-left:%s' %
+                             bs.css.get('margin-left', '0'))
                bs.css.pop('margin-left', None)
                for child in li:
                    child.set('style', 'display:table-cell')
-
--- a/src/calibre/ebooks/docx/styles.py
+++ b/src/calibre/ebooks/docx/styles.py
@ -92,7 +92,7 @@ class Style(object):
                else:
                    self.character_style.update(rs)

-        if self.style_type == 'numbering':
+        if self.style_type in {'numbering', 'paragraph'}:
            self.numbering_style_link = None
            for x in XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
                self.numbering_style_link = get(x, 'w:val')
@ -150,7 +150,7 @@ class Styles(object):
                self.id_map[s.style_id] = s
            if s.is_default:
                self.default_styles[s.style_type] = s
-            if s.style_type == 'numbering' and s.numbering_style_link:
+            if getattr(s, 'numbering_style_link', None) is not None:
                self.numbering_style_links[s.style_id] = s.numbering_style_link

        self.default_paragraph_style = self.default_character_style = None
@ -212,6 +212,7 @@ class Styles(object):
    def resolve_paragraph(self, p):
        ans = self.para_cache.get(p, None)
        if ans is None:
+            linked_style = None
            ans = self.para_cache[p] = ParagraphStyle()
            ans.style_name = None
            direct_formatting = None
@ -233,7 +234,7 @@ class Styles(object):

            default_para = self.default_styles.get('paragraph', None)
            if direct_formatting.linked_style is not None:
-                ls = self.get(direct_formatting.linked_style)
+                ls = linked_style = self.get(direct_formatting.linked_style)
                if ls is not None:
                    ans.style_name = ls.name
                    ps = ls.paragraph_style
@ -256,6 +257,11 @@ class Styles(object):
                    ps = self.numbering.get_para_style(num_id, lvl)
                    if ps is not None:
                        parent_styles.append(ps)
+            if not is_numbering and linked_style is not None and getattr(linked_style.paragraph_style, 'numbering', inherit) is not inherit:
+                num_id, lvl = linked_style.paragraph_style.numbering
+                if num_id is not None:
+                    p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
+                is_numbering = True

            for attr in ans.all_properties:
                if not (is_numbering and attr == 'text_indent'):  # skip text-indent for lists
@ -379,7 +385,7 @@ class Styles(object):

    def resolve_numbering(self, numbering):
        # When a numPr element appears inside a paragraph style, the lvl info
-        # must be discarder and pStyle used instead.
+        # must be discarded and pStyle used instead.
        self.numbering = numbering
        for style in self:
            ps = style.paragraph_style
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@ -30,6 +30,8 @@ from calibre.ebooks.docx.fields import Fields
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1

+NBSP = '\xa0'
+
 class Text:

    def __init__(self, elem, attr, buf):
@ -396,7 +398,17 @@ class Convert(object):
        if not dest.text and len(dest) == 0:
            # Empty paragraph add a non-breaking space so that it is rendered
            # by WebKit
-            dest.text = '\xa0'
+            dest.text = NBSP
+
+        # If the last element in a block is a <br>, the <br> is not rendered in
+        # HTML unless it is followed by a trailing space. Word, on the other
+        # hand, inserts a blank line for trailing <br>s.
+        if len(dest) > 0 and not dest[-1].tail:
+            if dest[-1].tag == 'br':
+                dest[-1].tail = NBSP
+            elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
+                dest[-1][-1].tail = NBSP
+
        return dest

    def wrap_elems(self, elems, wrapper):