Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00

Merge branch 'docx'

Fixes #1196728 [Private bug](https://bugs.launchpad.net/calibre/+bug/1196728)

This commit is contained in: commit 541db88ebf
@@ -183,7 +183,7 @@ class DOCX(object):
         root = fromstring(raw)
         for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
             target = item.get('Target')
-            if item.get('TargetMode', None) != 'External':
+            if item.get('TargetMode', None) != 'External' and not target.startswith('#'):
                 target = '/'.join((base, target.lstrip('/')))
             typ = item.get('Type')
             Id = item.get('Id')
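The one-line change above guards fragment-only targets: a Target of the form '#anchor' refers into the document itself and must not be resolved against the package base path. A self-contained sketch of the difference (the base value is hypothetical):

    # Sketch of the behaviour before and after the fix, assuming base = 'word'.
    base = 'word'
    target = '#_Toc123'  # a fragment-only target, e.g. a bookmark reference

    # Before: the fragment was joined onto the base path, producing a bogus name.
    broken = '/'.join((base, target.lstrip('/')))   # 'word/#_Toc123'

    # After: fragment-only targets are left alone and resolve within the document.
    resolved = target if target.startswith('#') else '/'.join((base, target.lstrip('/')))
    print(broken, resolved)  # word/#_Toc123 #_Toc123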
src/calibre/ebooks/docx/fields.py (new file, 106 lines)
@@ -0,0 +1,106 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

import re

from calibre.ebooks.docx.names import XPath, get


class Field(object):

    def __init__(self, start):
        self.start = start
        self.end = None
        self.contents = []
        self.instructions = []

    def add_instr(self, elem):
        raw = elem.text
        if not raw:
            return
        name, rest = raw.strip().partition(' ')[0::2]
        self.instructions.append((name, rest.strip()))

WORD, FLAG = 0, 1
scanner = re.Scanner([
    (r'\\\S{1}', lambda s, t: (t, FLAG)),  # A flag of the form \x
    (r'"[^"]*"', lambda s, t: (t[1:-1], WORD)),  # Quoted word
    (r'[^\s\\"]\S*', lambda s, t: (t, WORD)),  # A non-quoted word, must not start with a backslash or a space or a quote
    (r'\s+', None),
], flags=re.DOTALL)


def parse_hyperlink(raw, log):
    ans = {}
    last_option = None
    for token, token_type in scanner.scan(raw)[0]:
        if not ans:
            if token_type is not WORD:
                log('Invalid hyperlink, first token is not a URL (%s)' % raw)
                return ans
            ans['url'] = token
        if token_type is FLAG:
            last_option = {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}.get(token[1], None)
            if last_option is not None:
                ans[last_option] = None
        elif token_type is WORD:
            if last_option is not None:
                ans[last_option] = token
    return ans


class Fields(object):

    def __init__(self):
        self.fields = []

    def __call__(self, doc, log):
        stack = []
        for elem in XPath(
            '//*[name()="w:p" or name()="w:r" or name()="w:instrText" or (name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end"))]')(doc):
            if elem.tag.endswith('}fldChar'):
                typ = get(elem, 'w:fldCharType')
                if typ == 'begin':
                    stack.append(Field(elem))
                    self.fields.append(stack[-1])
                else:
                    try:
                        stack.pop().end = elem
                    except IndexError:
                        pass
            elif elem.tag.endswith('}instrText'):
                if stack:
                    stack[-1].add_instr(elem)
            else:
                if stack:
                    stack[-1].contents.append(elem)

        # Parse hyperlink fields
        self.hyperlink_fields = []
        for field in self.fields:
            if len(field.instructions) == 1 and field.instructions[0][0] == 'HYPERLINK':
                hl = parse_hyperlink(field.instructions[0][1], log)
                if hl:
                    if 'target' in hl and hl['target'] is None:
                        hl['target'] = '_blank'
                    all_runs = []
                    current_runs = []
                    # We only handle spans in a single paragraph
                    # being wrapped in <a>
                    for x in field.contents:
                        if x.tag.endswith('}p'):
                            if current_runs:
                                all_runs.append(current_runs)
                            current_runs = []
                        elif x.tag.endswith('}r'):
                            current_runs.append(x)
                    if current_runs:
                        all_runs.append(current_runs)
                    for runs in all_runs:
                        self.hyperlink_fields.append((hl, runs))
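For reference, the scanner splits a field instruction into \x flags and (optionally quoted) words, and parse_hyperlink folds the HYPERLINK switches it understands into a dict. An illustrative trace, runnable against the module above (URL and title invented):

    # Illustrative only: what parse_hyperlink returns for a typical instruction.
    # The raw text is everything after the HYPERLINK keyword.
    raw = r'"http://example.com/page" \t "_blank" \o "Example title"'
    # scanner.scan(raw)[0] yields:
    #   ('http://example.com/page', WORD), ('\\t', FLAG), ('_blank', WORD),
    #   ('\\o', FLAG), ('Example title', WORD)
    print(parse_hyperlink(raw, print))
    # -> {'url': 'http://example.com/page', 'target': '_blank', 'title': 'Example title'}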
@@ -96,6 +96,7 @@ class Images(object):
         self.used = {}
         self.names = set()
         self.all_images = set()
+        self.links = []

     def __call__(self, relationships_by_id):
         self.rid_map = relationships_by_id
@@ -125,8 +126,18 @@ class Images(object):
         self.all_images.add('images/' + name)
         return name

-    def pic_to_img(self, pic, alt=None):
+    def pic_to_img(self, pic, alt, parent):
         name = None
+        link = None
+        for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
+            link = {'id':get(hl, 'r:id')}
+            tgt = hl.get('tgtFrame', None)
+            if tgt:
+                link['target'] = tgt
+            title = hl.get('tooltip', None)
+            if title:
+                link['title'] = title
+
         for pr in XPath('descendant::pic:cNvPr')(pic):
             name = pr.get('name', None)
             if name:
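pic_to_img now receives the wp:inline or wp:anchor parent and checks it for an a:hlinkClick child, recording the relationship id plus the optional target frame and tooltip. A self-contained sketch of that extraction using plain lxml (namespace prefixes as in the diff, sample values invented):

    from lxml import etree

    A_NS = 'http://schemas.openxmlformats.org/drawingml/2006/main'
    R_NS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
    NSMAP = {'a': A_NS, 'r': R_NS}

    raw = ('<wrap xmlns:a="%s" xmlns:r="%s">'
           '<a:hlinkClick r:id="rId7" tooltip="Visit site"/>'
           '</wrap>' % (A_NS, R_NS))
    parent = etree.fromstring(raw)

    link = None
    for hl in parent.xpath('descendant::a:hlinkClick[@r:id]', namespaces=NSMAP):
        link = {'id': hl.get('{%s}id' % R_NS)}  # relationship id, resolved to a URL later
        tgt = hl.get('tgtFrame', None)          # optional browser target
        if tgt:
            link['target'] = tgt
        title = hl.get('tooltip', None)         # optional title text
        if title:
            link['title'] = title
    print(link)  # {'id': 'rId7', 'title': 'Visit site'}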
@@ -138,6 +149,8 @@ class Images(object):
         src = self.generate_filename(rid, name)
         img = IMG(src='images/%s' % src)
         img.set('alt', alt or 'Image')
+        if link is not None:
+            self.links.append((img, link))
         return img

     def drawing_to_html(self, drawing, page):
@@ -145,7 +158,7 @@ class Images(object):
         for inline in XPath('./wp:inline')(drawing):
             style, alt = get_image_properties(inline)
             for pic in XPath('descendant::pic:pic')(inline):
-                ans = self.pic_to_img(pic, alt)
+                ans = self.pic_to_img(pic, alt, inline)
                 if ans is not None:
                     if style:
                         ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.iteritems()))
@@ -156,7 +169,7 @@ class Images(object):
             style, alt = get_image_properties(anchor)
             self.get_float_properties(anchor, style, page)
             for pic in XPath('descendant::pic:pic')(anchor):
-                ans = self.pic_to_img(pic, alt)
+                ans = self.pic_to_img(pic, alt, anchor)
                 if ans is not None:
                     if style:
                         ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.iteritems()))
@@ -403,6 +403,11 @@ class Styles(object):
                 ps.margin_top = 0
             last_para = p

+    def apply_section_page_breaks(self, paras):
+        for p in paras:
+            ps = self.resolve_paragraph(p)
+            ps.pageBreakBefore = True
+
     def register(self, css, prefix):
         h = hash(frozenset(css.iteritems()))
         ans, _ = self.classes.get(h, (None, None))
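The new apply_section_page_breaks simply flags the resolved style of each paragraph it is given. Presumably the flag surfaces as page-break-before: always in the generated CSS; a hedged, illustrative sketch (the css() helper below is not calibre's API):

    # Hedged sketch: turning a pageBreakBefore flag into CSS. The attribute
    # name matches the diff; the css() helper is illustrative only.
    class ParagraphStyle(object):
        def __init__(self):
            self.pageBreakBefore = False

        def css(self):
            rules = {}
            if self.pageBreakBefore:
                rules['page-break-before'] = 'always'
            return rules

    ps = ParagraphStyle()
    ps.pageBreakBefore = True
    print(ps.css())  # {'page-break-before': 'always'}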
@@ -26,6 +26,7 @@ from calibre.ebooks.docx.footnotes import Footnotes
 from calibre.ebooks.docx.cleanup import cleanup_markup
 from calibre.ebooks.docx.theme import Theme
 from calibre.ebooks.docx.toc import create_toc
+from calibre.ebooks.docx.fields import Fields
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1

@@ -52,6 +53,7 @@ class Convert(object):
         self.body = BODY()
         self.theme = Theme()
         self.tables = Tables()
+        self.fields = Fields()
         self.styles = Styles(self.tables)
         self.images = Images()
         self.object_map = OrderedDict()
@@ -79,6 +81,7 @@ class Convert(object):
     def __call__(self):
         doc = self.docx.document
         relationships_by_id, relationships_by_type = self.docx.document_relationships
+        self.fields(doc, self.log)
         self.read_styles(relationships_by_type)
         self.images(relationships_by_id)
         self.layers = OrderedDict()
@@ -96,7 +99,11 @@ class Convert(object):
                 p = self.convert_p(wp)
                 self.body.append(p)
                 paras.append(wp)
+        self.read_block_anchors(doc)
         self.styles.apply_contextual_spacing(paras)
+        # Apply page breaks at the start of every section, except the first
+        # section (since that will be the start of the file)
+        self.styles.apply_section_page_breaks(self.section_starts[1:])

         notes_header = None
         if self.footnotes.has_notes:
@@ -177,6 +184,7 @@ class Convert(object):
     def read_page_properties(self, doc):
         current = []
         self.page_map = OrderedDict()
+        self.section_starts = []

         for p in descendants(doc, 'w:p', 'w:tbl'):
             if p.tag.endswith('}tbl'):
@@ -186,8 +194,10 @@ class Convert(object):
             sect = tuple(descendants(p, 'w:sectPr'))
             if sect:
                 pr = PageProperties(sect)
-                for x in current + [p]:
+                paras = current + [p]
+                for x in paras:
                     self.page_map[x] = pr
+                self.section_starts.append(paras[0])
                 current = []
             else:
                 current.append(p)
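A w:sectPr closes a section, so the paragraphs accumulated in current plus the closing paragraph form one section, and its first paragraph is recorded in section_starts. A stripped-down model of that bookkeeping (paragraph names invented):

    # Stripped-down model of the section bookkeeping; True marks a paragraph
    # that carries a sectPr and therefore closes its section.
    paragraphs = [('p1', False), ('p2', True),
                  ('p3', False), ('p4', True)]

    current, section_starts = [], []
    for name, has_sectpr in paragraphs:
        if has_sectpr:
            paras = current + [name]
            section_starts.append(paras[0])  # first paragraph of the section
            current = []
        else:
            current.append(name)

    print(section_starts)  # ['p1', 'p3']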
@@ -287,6 +297,22 @@ class Convert(object):
         opf.render(of, ncx, 'toc.ncx')
         return os.path.join(self.dest_dir, 'metadata.opf')

+    def read_block_anchors(self, doc):
+        doc_anchors = frozenset(XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
+        if doc_anchors:
+            current_bm = None
+            rmap = {v:k for k, v in self.object_map.iteritems()}
+            for p in descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
+                if p.tag.endswith('}p'):
+                    if current_bm and p in rmap:
+                        para = rmap[p]
+                        if 'id' not in para.attrib:
+                            para.set('id', generate_anchor(current_bm, frozenset(self.anchor_map.itervalues())))
+                        self.anchor_map[current_bm] = para.get('id')
+                        current_bm = None
+                elif p in doc_anchors:
+                    current_bm = get(p, 'w:name')
+
     def convert_p(self, p):
         dest = P()
         self.object_map[dest] = p
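Body-level bookmarks sit between paragraphs and have no run to attach to, so read_block_anchors holds the bookmark name until the next converted paragraph appears and then uses that paragraph's id as the anchor. The same deferred attachment in miniature (names invented):

    # Small model of deferring a body-level bookmark to the next paragraph.
    events = [('bookmark', 'chapter2'), ('para', 'p-017'), ('para', 'p-018')]

    anchor_map, current_bm = {}, None
    for kind, value in events:
        if kind == 'bookmark':
            current_bm = value                 # remember the bookmark name
        elif kind == 'para' and current_bm:
            anchor_map[current_bm] = value     # next paragraph's id is the anchor
            current_bm = None

    print(anchor_map)  # {'chapter2': 'p-017'}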
@@ -316,7 +342,13 @@ class Convert(object):
             elif x.tag.endswith('}bookmarkStart'):
                 anchor = get(x, 'w:name')
                 if anchor and anchor not in self.anchor_map:
+                    old_anchor = current_anchor
                     self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.itervalues()))
+                    if old_anchor is not None:
+                        # The previous anchor was not applied to any element
+                        for a, t in tuple(self.anchor_map.iteritems()):
+                            if t == old_anchor:
+                                self.anchor_map[a] = current_anchor
             elif x.tag.endswith('}hyperlink'):
                 current_hyperlink = x

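When a second bookmarkStart is seen before the previous anchor was applied to any element, every bookmark name that mapped to the stale anchor is redirected to the new one, so both names keep resolving to the same target. In miniature (names invented):

    # Miniature of the remapping when consecutive bookmarks share one target.
    anchor_map = {'bm_one': 'anchor1'}   # 'anchor1' never landed on an element
    old_anchor, current_anchor = 'anchor1', 'anchor2'

    for a, t in tuple(anchor_map.items()):
        if t == old_anchor:
            anchor_map[a] = current_anchor

    anchor_map['bm_two'] = current_anchor
    print(anchor_map)  # {'bm_one': 'anchor2', 'bm_two': 'anchor2'}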
@@ -396,6 +428,46 @@ class Convert(object):
             # hrefs that point nowhere give epubcheck a hernia. The element
             # should be styled explicitly by Word anyway.
             # span.set('href', '#')
+        rmap = {v:k for k, v in self.object_map.iteritems()}
+        for hyperlink, runs in self.fields.hyperlink_fields:
+            spans = [rmap[r] for r in runs if r in rmap]
+            if not spans:
+                continue
+            if len(spans) > 1:
+                span = self.wrap_elems(spans, SPAN())
+            span.tag = 'a'
+            tgt = hyperlink.get('target', None)
+            if tgt:
+                span.set('target', tgt)
+            tt = hyperlink.get('title', None)
+            if tt:
+                span.set('title', tt)
+            url = hyperlink['url']
+            if url in self.anchor_map:
+                span.set('href', '#' + self.anchor_map[url])
+                continue
+            span.set('href', url)
+
+        for img, link in self.images.links:
+            parent = img.getparent()
+            idx = parent.index(img)
+            a = A(img)
+            a.tail, img.tail = img.tail, None
+            parent.insert(idx, a)
+            tgt = link.get('target', None)
+            if tgt:
+                a.set('target', tgt)
+            tt = link.get('title', None)
+            if tt:
+                a.set('title', tt)
+            rid = link['id']
+            if rid in relationships_by_id:
+                dest = relationships_by_id[rid]
+                if dest.startswith('#'):
+                    if dest[1:] in self.anchor_map:
+                        a.set('href', '#' + self.anchor_map[dest[1:]])
+                else:
+                    a.set('href', dest)
+
     def convert_run(self, run):
         ans = SPAN()
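Wrapping an existing img in a new a element is plain lxml surgery: compute the image's slot first, let the anchor adopt the image, move the tail text across, then insert the anchor where the image was. A self-contained demonstration (markup and URL invented):

    # Self-contained demonstration of the img-wrapping step.
    from lxml import etree
    from lxml.html.builder import A

    body = etree.fromstring('<div><img src="images/x.png"/>after</div>')
    img = body[0]

    parent = img.getparent()
    idx = parent.index(img)             # slot must be taken before img moves
    a = A(img)                          # <a> adopts the image as its child
    a.tail, img.tail = img.tail, None   # move the tail text onto the anchor
    parent.insert(idx, a)
    a.set('href', 'http://example.com')

    print(etree.tostring(body))
    # b'<div><a href="http://example.com"><img src="images/x.png"/></a>after</div>'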
@@ -339,6 +339,8 @@ class FlowSplitter(object):
                     # We want to keep the descendants of the split point in
                     # Tree 1
                     keep_descendants = True
+                    # We want the split point element, but not its tail
+                    elem.tail = '\n'

                 continue
             if hit_split_point:
|
|||||||
for elem in tuple(body2.iterdescendants()):
|
for elem in tuple(body2.iterdescendants()):
|
||||||
if elem is split_point2:
|
if elem is split_point2:
|
||||||
if not before:
|
if not before:
|
||||||
|
# Keep the split point element's tail, if it contains non-whitespace
|
||||||
|
# text
|
||||||
|
tail = elem.tail
|
||||||
|
if tail and not tail.isspace():
|
||||||
|
parent = elem.getparent()
|
||||||
|
idx = parent.index(elem)
|
||||||
|
if idx == 0:
|
||||||
|
parent.text = (parent.text or '') + tail
|
||||||
|
else:
|
||||||
|
sib = parent[idx-1]
|
||||||
|
sib.tail = (sib.tail or '') + tail
|
||||||
|
# Remove the element itself
|
||||||
nix_element(elem)
|
nix_element(elem)
|
||||||
break
|
break
|
||||||
if elem in ancestors:
|
if elem in ancestors:
|
||||||
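In lxml, text that follows an element is stored on that element's tail, so deleting a split-point element would silently drop it. The fix reattaches the tail to the previous sibling, or to the parent's text when the element comes first. A compact, runnable demonstration (markup invented; the real code removes the element via nix_element):

    # Compact demonstration of preserving an element's tail before removal.
    from lxml import etree

    root = etree.fromstring('<p>lead<br/> kept text</p>')
    elem = root[0]                      # the <br/> carries the tail ' kept text'

    tail = elem.tail
    if tail and not tail.isspace():
        parent = elem.getparent()
        idx = parent.index(elem)
        if idx == 0:
            parent.text = (parent.text or '') + tail
        else:
            sib = parent[idx - 1]
            sib.tail = (sib.tail or '') + tail
    parent.remove(elem)                 # the real code uses nix_element

    print(etree.tostring(root))  # b'<p>lead kept text</p>'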