DOCX Input: Add support for multilingual documents. Fixes #1394428 [docx driver doesn't handle different languages](https://bugs.launchpad.net/calibre/+bug/1394428)

This commit is contained in:
Kovid Goyal 2014-11-20 08:27:06 +05:30
parent 35406e21d8
commit 00ea2cc4a4
2 changed files with 20 additions and 8 deletions

View File

@ -115,7 +115,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
# Merge consecutive spans that have the same styling
current_run = []
for span in root.xpath('//span[not(@style)]'):
for span in root.xpath('//span[not(@style or @lang)]'):
if not current_run:
current_run.append(span)
else:
@ -144,6 +144,8 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
parent.set('class', pclass)
parent.text = span.text
parent.remove(span)
if span.get('lang'):
parent.set('lang', span.get('lang'))
for child in span:
parent.append(child)
@ -159,7 +161,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
del span.attrib['class']
# Get rid of <span>s that have no styling
for span in root.xpath('//span[not(@class) and not(@id) and not(@style)]'):
for span in root.xpath('//span[not(@class or @id or @style or @lang)]'):
lift(span)
# Convert <p><br style="page-break-after:always"> </p> style page breaks

View File

@ -43,6 +43,13 @@ class Text:
setattr(self.elem, self.attr, ''.join(self.buf))
self.elem, self.attr, self.buf = elem, 'tail', []
def html_lang(docx_lang):
    """Translate a DOCX language tag into a value for an HTML ``lang`` attribute.

    The tag is first passed through ``canonicalize_lang``. If that yields
    nothing, or yields the undetermined code ``'und'``, there is no usable
    language and ``None`` is returned. Otherwise the canonical code is
    converted with ``lang_as_iso639_1`` and the result is returned when it is
    non-empty; an empty conversion result also gives ``None``.
    """
    canonical = canonicalize_lang(docx_lang)
    # Guard clause: unknown or explicitly undetermined languages are dropped.
    if not canonical or canonical == 'und':
        return None
    # Collapse any falsy conversion result to None so callers can rely on
    # "a non-empty code or None".
    return lang_as_iso639_1(canonical) or None
class Convert(object):
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
@ -77,11 +84,12 @@ class Convert(object):
child.tail = '\n\t\t'
self.html[0][-1].tail = '\n\t'
self.html[1].text = self.html[1].tail = '\n'
lang = canonicalize_lang(self.mi.language)
if lang and lang != 'und':
lang = lang_as_iso639_1(lang)
if lang:
self.html.set('lang', lang)
lang = html_lang(self.mi.language)
if lang:
self.html.set('lang', lang)
self.doc_lang = lang
else:
self.doc_lang = None
def __call__(self):
doc = self.docx.document
@ -626,7 +634,9 @@ class Convert(object):
if style.vert_align in {'superscript', 'subscript'}:
ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
if style.lang is not inherit:
ans.lang = style.lang
lang = html_lang(style.lang)
if lang is not None and lang != self.doc_lang:
ans.set('lang', lang)
return ans
def add_frame(self, html_obj, style):