From 95102dee294af3e6ca59a953f7dae25969e754de Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 22 May 2013 11:44:53 +0530
Subject: [PATCH] DOCX: Bookmarks

Give each w:bookmarkStart found in a paragraph an HTML id. The new
Convert.anchor_map maps the bookmark name to the generated id; the id is
set on dest while dest has no children yet, otherwise on the span of the
next converted run. A bookmark name that is already in anchor_map is
ignored.

names.py gains three helpers: expand() turns a 'prefix:tag' name into
'{namespace-uri}tag', ancestor() returns the nearest enclosing element
with a given tag, and generate_anchor() builds an id of the form
'id_<name>' that does not collide with the ids already handed out.
get() is now written in terms of expand().

get_name() in to_html.py now passes a '/'-joined path instead of a list
to self.docx.exists() and returns that path when it exists.
---
Notes (not part of the commit message):
- Relative to commit 95102dee this revision additionally lets expand()
  return un-prefixed names unchanged (it used to raise KeyError), replaces
  the no-op "name = name" in get_name() with the path that was just
  checked, and adds docstrings plus one comment. The blob ids on the
  "index" lines are those of the original commit.
- The line structure was rebuilt from a whitespace-collapsed copy, so the
  indentation of context lines should be confirmed against the tree.
- The author address in the From: header and in the __copyright__ context
  line appears to have been stripped from that copy; restore it from the
  repository before applying.

 src/calibre/ebooks/docx/names.py   | 34 +++++++++++++++++++++++++++++++---
 src/calibre/ebooks/docx/to_html.py | 27 ++++++++++++++++++++-------
 2 files changed, 51 insertions(+), 10 deletions(-)

diff --git a/src/calibre/ebooks/docx/names.py b/src/calibre/ebooks/docx/names.py
index d6cecdeeb6..6ff1cefa83 100644
--- a/src/calibre/ebooks/docx/names.py
+++ b/src/calibre/ebooks/docx/names.py
@@ -6,8 +6,12 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal '
 
+import re
+
 from lxml.etree import XPath as X
 
+from calibre.utils.filenames import ascii_text
+
 DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
 DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
 APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
@@ -66,7 +70,31 @@ def barename(x):
 def XML(x):
     return '{%s}%s' % (namespaces['xml'], x)
 
-def get(x, attr, default=None):
-    ns, name = attr.partition(':')[0::2]
-    return x.attrib.get('{%s}%s' % (namespaces[ns], name), default)
+def expand(name):
+    '''Expand prefix:tag to {namespace-uri}tag; a name without a prefix is returned unchanged.'''
+    ns, tag = name.partition(':')[0::2]
+    if ns and tag:
+        tag = '{%s}%s' % (namespaces[ns], tag)
+    return tag or ns
+
+def get(x, attr, default=None):
+    '''Return the value of the attribute attr (prefix:name form) of x, or default if it is not set.'''
+    return x.attrib.get(expand(attr), default)
+
+def ancestor(elem, name):
+    '''Return the nearest ancestor of elem whose tag is name (prefix:tag form), or None if there is none.'''
+    tag = expand(name)
+    while elem is not None:
+        elem = elem.getparent()
+        if getattr(elem, 'tag', None) == tag:
+            return elem
+
+def generate_anchor(name, existing):
+    '''Return 'id_' plus the ASCII letters, digits and underscores of name, suffixed with _N if needed so it is not in existing.'''
+    x = y = 'id_' + re.sub(r'[^0-9a-zA-Z_]', '', ascii_text(name)).lstrip('_')
+    c = 1
+    while y in existing:
+        y = '%s_%d' % (x, c)
+        c += 1
+    return y
 
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
index 3ac94e8da3..588fff3b4c 100644
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -14,7 +14,7 @@ from lxml.html.builder import (
     HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV)
 
 from calibre.ebooks.docx.container import DOCX, fromstring
-from calibre.ebooks.docx.names import XPath, is_tag, XML, STYLES, NUMBERING, FONTS
+from calibre.ebooks.docx.names import XPath, is_tag, XML, STYLES, NUMBERING, FONTS, expand, get, generate_anchor
 from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
 from calibre.ebooks.docx.numbering import Numbering
 from calibre.ebooks.docx.fonts import Fonts
@@ -70,6 +70,7 @@ class Convert(object):
         self.layers = OrderedDict()
         self.framed = [[]]
         self.framed_map = {}
+        self.anchor_map = {}
 
         self.read_page_properties(doc)
         for wp, page_properties in self.page_map.iteritems():
@@ -141,7 +142,7 @@ class Convert(object):
             if name is None:
                 cname = self.docx.document_name.split('/')
                 cname[-1] = defname
-                if self.docx.exists(cname):
-                    name = name
+                if self.docx.exists('/'.join(cname)):
+                    name = '/'.join(cname)
             return name
 
@@ -193,10 +194,22 @@ class Convert(object):
         style = self.styles.resolve_paragraph(p)
         self.layers[p] = []
         self.add_frame(dest, style.frame)
-        for run in XPath('descendant::w:r')(p):
-            span = self.convert_run(run)
-            dest.append(span)
-            self.layers[p].append(run)
+
+        current_anchor = None
+
+        for x in p.iterdescendants(expand('w:r'), expand('w:bookmarkStart')):
+            if x.tag.endswith('}r'):
+                span = self.convert_run(x)
+                if current_anchor is not None:
+                    # Put the id on dest while it is still empty, otherwise on this run's span
+                    (dest if len(dest) == 0 else span).set('id', current_anchor)
+                    current_anchor = None
+                dest.append(span)
+                self.layers[p].append(x)
+            elif x.tag.endswith('}bookmarkStart'):
+                anchor = get(x, 'w:name')
+                if anchor and anchor not in self.anchor_map:
+                    self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.itervalues()))
 
         m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
         if m is not None: