mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
DOCX Input: Fix various issues with numbering->lists
DOCX Input: Handle numbered paragraphs where the numbering is specified in the paragraph style, instead of on the paragraph directly. Also support the use of arbitrary, styled text for bullets.
This commit is contained in:
parent
14c1c6aabe
commit
ec29e28944
@ -196,9 +196,9 @@ class RunStyle(object):
|
||||
td = set()
|
||||
if self.text_decoration is not inherit:
|
||||
td.add(self.text_decoration)
|
||||
if self.strike:
|
||||
if self.strike and self.strike is not inherit:
|
||||
td.add('line-through')
|
||||
if self.dstrike:
|
||||
if self.dstrike and self.dstrike is not inherit:
|
||||
td.add('line-through')
|
||||
if td:
|
||||
c['text-decoration'] = ' '.join(td)
|
||||
@ -206,7 +206,7 @@ class RunStyle(object):
|
||||
c['text-transform'] = 'uppercase'
|
||||
if self.i is True:
|
||||
c['font-style'] = 'italic'
|
||||
if self.shadow:
|
||||
if self.shadow and self.shadow is not inherit:
|
||||
c['text-shadow'] = '2px 2px'
|
||||
if self.smallCaps is True:
|
||||
c['font-variant'] = 'small-caps'
|
||||
|
@ -10,8 +10,6 @@ import os
|
||||
|
||||
from calibre.ebooks.docx.names import XPath
|
||||
|
||||
NBSP = '\xa0'
|
||||
|
||||
def mergeable(previous, current):
|
||||
if previous.tail or current.tail:
|
||||
return False
|
||||
@ -116,7 +114,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
|
||||
|
||||
# Merge consecutive spans that have the same styling
|
||||
current_run = []
|
||||
for span in root.xpath('//span'):
|
||||
for span in root.xpath('//span[not(@style)]'):
|
||||
if not current_run:
|
||||
current_run.append(span)
|
||||
else:
|
||||
@ -149,7 +147,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
|
||||
parent.append(child)
|
||||
|
||||
# Make spans whose only styling is bold or italic into <b> and <i> tags
|
||||
for span in root.xpath('//span[@class]'):
|
||||
for span in root.xpath('//span[@class and not(@style)]'):
|
||||
css = class_map.get(span.get('class', None), {})
|
||||
if len(css) == 1:
|
||||
if css == {'font-style':'italic'}:
|
||||
@ -160,15 +158,9 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
|
||||
del span.attrib['class']
|
||||
|
||||
# Get rid of <span>s that have no styling
|
||||
for span in root.xpath('//span[not(@class) and not(@id)]'):
|
||||
for span in root.xpath('//span[not(@class) and not(@id) and not(@style)]'):
|
||||
lift(span)
|
||||
|
||||
# If a paragraph ends with a <br>, that <br> is not rendered in HTML, but
|
||||
# it is in Word, so add a trailing space to ensure it is rendered.
|
||||
for br in root.xpath('//*[contains("p,h1,h2,h3,h4,h5,h6,li", name())]/node()[position()=last()]/self::br'):
|
||||
if not br.tail:
|
||||
br.tail = NBSP
|
||||
|
||||
if detect_cover:
|
||||
# Check if the first image in the document is possibly a cover
|
||||
img = root.xpath('//img[@src][1]')
|
||||
|
@ -12,7 +12,7 @@ from collections import Counter
|
||||
from lxml.html.builder import OL, UL, SPAN
|
||||
|
||||
from calibre.ebooks.docx.block_styles import ParagraphStyle
|
||||
from calibre.ebooks.docx.char_styles import RunStyle
|
||||
from calibre.ebooks.docx.char_styles import RunStyle, inherit
|
||||
from calibre.ebooks.docx.names import XPath, get
|
||||
|
||||
STYLE_MAP = {
|
||||
@ -40,6 +40,7 @@ class Level(object):
|
||||
self.paragraph_style = self.character_style = None
|
||||
self.is_numbered = False
|
||||
self.num_template = None
|
||||
self.bullet_template = None
|
||||
self.pic_id = None
|
||||
|
||||
if lvl is not None:
|
||||
@ -47,17 +48,17 @@ class Level(object):
|
||||
|
||||
def copy(self):
|
||||
ans = Level()
|
||||
for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template'):
|
||||
for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template', 'bullet_template'):
|
||||
setattr(ans, x, getattr(self, x))
|
||||
return ans
|
||||
|
||||
def format_template(self, counter, ilvl):
|
||||
def format_template(self, counter, ilvl, template):
|
||||
def sub(m):
|
||||
x = int(m.group(1)) - 1
|
||||
if x > ilvl or x not in counter:
|
||||
return ''
|
||||
return '%d' % (counter[x] - (0 if x == ilvl else 1))
|
||||
return re.sub(r'%(\d+)', sub, self.num_template).rstrip() + '\xa0'
|
||||
return re.sub(r'%(\d+)', sub, template).rstrip() + '\xa0'
|
||||
|
||||
def read_from_xml(self, lvl, override=False):
|
||||
for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
|
||||
@ -72,6 +73,13 @@ class Level(object):
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
|
||||
for rPr in XPath('./w:rPr')(lvl):
|
||||
ps = RunStyle(rPr)
|
||||
if self.character_style is None:
|
||||
self.character_style = ps
|
||||
else:
|
||||
self.character_style.update(ps)
|
||||
|
||||
lt = None
|
||||
for lr in XPath('./w:lvlText[@w:val]')(lvl):
|
||||
lt = get(lr, 'w:val')
|
||||
@ -80,7 +88,12 @@ class Level(object):
|
||||
val = get(lr, 'w:val')
|
||||
if val == 'bullet':
|
||||
self.is_numbered = False
|
||||
self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
|
||||
cs = self.character_style
|
||||
if lt in {'\uf0a7', 'o'} or (
|
||||
cs.font_family is not inherit and cs.font_family.lower() in {'wingdings', 'symbol'}):
|
||||
self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
|
||||
else:
|
||||
self.bullet_template = lt
|
||||
for lpid in XPath('./w:lvlPicBulletId[@w:val]')(lvl):
|
||||
self.pic_id = get(lpid, 'w:val')
|
||||
else:
|
||||
@ -99,13 +112,6 @@ class Level(object):
|
||||
else:
|
||||
self.paragraph_style.update(ps)
|
||||
|
||||
for rPr in XPath('./w:rPr')(lvl):
|
||||
ps = RunStyle(rPr)
|
||||
if self.character_style is None:
|
||||
self.character_style = ps
|
||||
else:
|
||||
self.character_style.update(ps)
|
||||
|
||||
def css(self, images, pic_map, rid_map):
|
||||
ans = {'list-style-type': self.fmt}
|
||||
if self.pic_id:
|
||||
@ -119,6 +125,14 @@ class Level(object):
|
||||
ans['list-style-image'] = 'url("images/%s")' % fname
|
||||
return ans
|
||||
|
||||
def char_css(self):
    """Return the CSS for this level's bullet/number text, without ``font-family``.

    The properties are read from ``self.character_style.css``. If that
    attribute access raises ``AttributeError`` (for example when
    ``character_style`` is still ``None``), an empty dict is returned.

    A new dict is always returned. The mapping exposed by the character
    style is never modified, so asking for the bullet CSS does not strip
    ``font-family`` from the style itself.
    """
    try:
        style_css = self.character_style.css
    except AttributeError:
        # No usable character style on this level: nothing to apply.
        return {}
    # Filter into a fresh dict instead of popping from the style's own
    # mapping, which may be shared with other users of the style.
    return {prop: val for prop, val in style_css.items() if prop != 'font-family'}
|
||||
|
||||
class NumberingDefinition(object):
|
||||
|
||||
def __init__(self, parent=None):
|
||||
@ -233,7 +247,10 @@ class Numbering(object):
|
||||
p.set('list-lvl', str(ilvl))
|
||||
p.set('list-id', num_id)
|
||||
if lvl.num_template is not None:
|
||||
val = lvl.format_template(counter, ilvl)
|
||||
val = lvl.format_template(counter, ilvl, lvl.num_template)
|
||||
p.set('list-template', val)
|
||||
elif lvl.bullet_template is not None:
|
||||
val = lvl.format_template(counter, ilvl, lvl.bullet_template)
|
||||
p.set('list-template', val)
|
||||
self.update_counter(counter, ilvl, d.levels)
|
||||
|
||||
@ -250,12 +267,15 @@ class Numbering(object):
|
||||
ilvl = int(start.get('list-lvl'))
|
||||
lvl = d.levels[ilvl]
|
||||
lvlid = start.get('list-id') + start.get('list-lvl')
|
||||
wrap = (OL if lvl.is_numbered else UL)('\n\t')
|
||||
has_template = 'list-template' in start.attrib
|
||||
wrap = (OL if lvl.is_numbered or has_template else UL)('\n\t')
|
||||
if has_template:
|
||||
wrap.set('lvlid', lvlid)
|
||||
else:
|
||||
wrap.set('class', styles.register(lvl.css(images, self.pic_map, self.rid_map), 'list'))
|
||||
ccss = lvl.char_css()
|
||||
if ccss:
|
||||
ccss = styles.register(ccss, 'bullet')
|
||||
parent.insert(idx, wrap)
|
||||
last_val = None
|
||||
for child in current_run:
|
||||
@ -269,6 +289,8 @@ class Numbering(object):
|
||||
span.append(gc)
|
||||
child.append(span)
|
||||
span = SPAN(child.get('list-template'))
|
||||
if ccss:
|
||||
span.set('class', ccss)
|
||||
last = templates.get(lvlid, '')
|
||||
if span.text and len(span.text) > len(last):
|
||||
templates[lvlid] = span.text
|
||||
@ -299,15 +321,12 @@ class Numbering(object):
|
||||
commit(current_run)
|
||||
commit(current_run)
|
||||
|
||||
# Convert the list items that use custom text for bullets into tables
|
||||
# so that they display correctly
|
||||
for wrap in body.xpath('//ol[@lvlid]'):
|
||||
lvlid = wrap.attrib.pop('lvlid')
|
||||
wrap.attrib.pop('lvlid')
|
||||
wrap.tag = 'div'
|
||||
text = ''
|
||||
maxtext = templates.get(lvlid, '').replace('.', '')[:-1]
|
||||
for li in wrap.iterchildren('li'):
|
||||
t = li[0].text
|
||||
if t and len(t) > len(text):
|
||||
text = t
|
||||
wrap.set('style', 'display:table')
|
||||
for i, li in enumerate(wrap.iterchildren('li')):
|
||||
li.tag = 'div'
|
||||
li.attrib.pop('value', None)
|
||||
@ -315,9 +334,8 @@ class Numbering(object):
|
||||
obj = object_map[li]
|
||||
bs = styles.para_cache[obj]
|
||||
if i == 0:
|
||||
m = len(maxtext) # Move the table left to simulate the behavior of a list (number is to the left of text margin)
|
||||
wrap.set('style', 'display:table; margin-left: -%dem; padding-left: %s' % (m, bs.css.get('margin-left', 0)))
|
||||
wrap.set('style', 'display:table; padding-left:%s' %
|
||||
bs.css.get('margin-left', '0'))
|
||||
bs.css.pop('margin-left', None)
|
||||
for child in li:
|
||||
child.set('style', 'display:table-cell')
|
||||
|
||||
|
@ -92,7 +92,7 @@ class Style(object):
|
||||
else:
|
||||
self.character_style.update(rs)
|
||||
|
||||
if self.style_type == 'numbering':
|
||||
if self.style_type in {'numbering', 'paragraph'}:
|
||||
self.numbering_style_link = None
|
||||
for x in XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
|
||||
self.numbering_style_link = get(x, 'w:val')
|
||||
@ -150,7 +150,7 @@ class Styles(object):
|
||||
self.id_map[s.style_id] = s
|
||||
if s.is_default:
|
||||
self.default_styles[s.style_type] = s
|
||||
if s.style_type == 'numbering' and s.numbering_style_link:
|
||||
if getattr(s, 'numbering_style_link', None) is not None:
|
||||
self.numbering_style_links[s.style_id] = s.numbering_style_link
|
||||
|
||||
self.default_paragraph_style = self.default_character_style = None
|
||||
@ -212,6 +212,7 @@ class Styles(object):
|
||||
def resolve_paragraph(self, p):
|
||||
ans = self.para_cache.get(p, None)
|
||||
if ans is None:
|
||||
linked_style = None
|
||||
ans = self.para_cache[p] = ParagraphStyle()
|
||||
ans.style_name = None
|
||||
direct_formatting = None
|
||||
@ -233,7 +234,7 @@ class Styles(object):
|
||||
|
||||
default_para = self.default_styles.get('paragraph', None)
|
||||
if direct_formatting.linked_style is not None:
|
||||
ls = self.get(direct_formatting.linked_style)
|
||||
ls = linked_style = self.get(direct_formatting.linked_style)
|
||||
if ls is not None:
|
||||
ans.style_name = ls.name
|
||||
ps = ls.paragraph_style
|
||||
@ -256,6 +257,11 @@ class Styles(object):
|
||||
ps = self.numbering.get_para_style(num_id, lvl)
|
||||
if ps is not None:
|
||||
parent_styles.append(ps)
|
||||
if not is_numbering and linked_style is not None and getattr(linked_style.paragraph_style, 'numbering', inherit) is not inherit:
|
||||
num_id, lvl = linked_style.paragraph_style.numbering
|
||||
if num_id is not None:
|
||||
p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
|
||||
is_numbering = True
|
||||
|
||||
for attr in ans.all_properties:
|
||||
if not (is_numbering and attr == 'text_indent'): # skip text-indent for lists
|
||||
@ -379,7 +385,7 @@ class Styles(object):
|
||||
|
||||
def resolve_numbering(self, numbering):
|
||||
# When a numPr element appears inside a paragraph style, the lvl info
|
||||
# must be discarder and pStyle used instead.
|
||||
# must be discarded and pStyle used instead.
|
||||
self.numbering = numbering
|
||||
for style in self:
|
||||
ps = style.paragraph_style
|
||||
|
@ -30,6 +30,8 @@ from calibre.ebooks.docx.fields import Fields
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||
|
||||
NBSP = '\xa0'
|
||||
|
||||
class Text:
|
||||
|
||||
def __init__(self, elem, attr, buf):
|
||||
@ -396,7 +398,17 @@ class Convert(object):
|
||||
if not dest.text and len(dest) == 0:
|
||||
# Empty paragraph add a non-breaking space so that it is rendered
|
||||
# by WebKit
|
||||
dest.text = '\xa0'
|
||||
dest.text = NBSP
|
||||
|
||||
# If the last element in a block is a <br> the <br> is not rendered in
|
||||
# HTML, unless it is followed by a trailing space. Word, on the other
|
||||
# hand inserts a blank line for trailing <br>s.
|
||||
if len(dest) > 0 and not dest[-1].tail:
|
||||
if dest[-1].tag == 'br':
|
||||
dest[-1].tail = NBSP
|
||||
elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
|
||||
dest[-1][-1].tail = NBSP
|
||||
|
||||
return dest
|
||||
|
||||
def wrap_elems(self, elems, wrapper):
|
||||
|
Loading…
x
Reference in New Issue
Block a user