mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
DOCX Input: Add support for multilingual documents. Fixes #1394428 [docx driver doesn't handle different languages](https://bugs.launchpad.net/calibre/+bug/1394428)
This commit is contained in:
parent
35406e21d8
commit
00ea2cc4a4
@ -115,7 +115,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
|
||||
|
||||
# Merge consecutive spans that have the same styling
|
||||
current_run = []
|
||||
for span in root.xpath('//span[not(@style)]'):
|
||||
for span in root.xpath('//span[not(@style or @lang)]'):
|
||||
if not current_run:
|
||||
current_run.append(span)
|
||||
else:
|
||||
@ -144,6 +144,8 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
|
||||
parent.set('class', pclass)
|
||||
parent.text = span.text
|
||||
parent.remove(span)
|
||||
if span.get('lang'):
|
||||
parent.set('lang', span.get('lang'))
|
||||
for child in span:
|
||||
parent.append(child)
|
||||
|
||||
@ -159,7 +161,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
|
||||
del span.attrib['class']
|
||||
|
||||
# Get rid of <span>s that have no styling
|
||||
for span in root.xpath('//span[not(@class) and not(@id) and not(@style)]'):
|
||||
for span in root.xpath('//span[not(@class or @id or @style or @lang)]'):
|
||||
lift(span)
|
||||
|
||||
# Convert <p><br style="page-break-after:always"> </p> style page breaks
|
||||
|
@ -43,6 +43,13 @@ class Text:
|
||||
setattr(self.elem, self.attr, ''.join(self.buf))
|
||||
self.elem, self.attr, self.buf = elem, 'tail', []
|
||||
|
||||
def html_lang(docx_lang):
    """Convert a DOCX language tag into a value for an HTML ``lang`` attribute.

    The tag is first normalised with ``canonicalize_lang`` and the result is
    then passed through ``lang_as_iso639_1``.

    Returns the converted code, or ``None`` when the tag canonicalizes to an
    empty value or to ``'und'``, or when the conversion yields nothing.
    """
    canonical = canonicalize_lang(docx_lang)
    # Nothing usable: either no canonical form, or the literal 'und' marker.
    if not canonical or canonical == 'und':
        return None
    short_code = lang_as_iso639_1(canonical)
    # A falsy conversion result is reported as None, never passed through.
    return short_code if short_code else None
|
||||
|
||||
class Convert(object):
|
||||
|
||||
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
|
||||
@ -77,11 +84,12 @@ class Convert(object):
|
||||
child.tail = '\n\t\t'
|
||||
self.html[0][-1].tail = '\n\t'
|
||||
self.html[1].text = self.html[1].tail = '\n'
|
||||
lang = canonicalize_lang(self.mi.language)
|
||||
if lang and lang != 'und':
|
||||
lang = lang_as_iso639_1(lang)
|
||||
lang = html_lang(self.mi.language)
|
||||
if lang:
|
||||
self.html.set('lang', lang)
|
||||
self.doc_lang = lang
|
||||
else:
|
||||
self.doc_lang = None
|
||||
|
||||
def __call__(self):
|
||||
doc = self.docx.document
|
||||
@ -626,7 +634,9 @@ class Convert(object):
|
||||
if style.vert_align in {'superscript', 'subscript'}:
|
||||
ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
|
||||
if style.lang is not inherit:
|
||||
ans.lang = style.lang
|
||||
lang = html_lang(style.lang)
|
||||
if lang is not None and lang != self.doc_lang:
|
||||
ans.set('lang', lang)
|
||||
return ans
|
||||
|
||||
def add_frame(self, html_obj, style):
|
||||
|
Loading…
x
Reference in New Issue
Block a user