From 00ea2cc4a48b4d10d6710fadb59a0a430789752d Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 20 Nov 2014 08:27:06 +0530
Subject: [PATCH] DOCX Input: Add support for multilingual documents. Fixes #1394428 [docx driver doesn't handle different languages](https://bugs.launchpad.net/calibre/+bug/1394428)

---
 src/calibre/ebooks/docx/cleanup.py |  6 ++++--
 src/calibre/ebooks/docx/to_html.py | 22 ++++++++++++++++------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/calibre/ebooks/docx/cleanup.py b/src/calibre/ebooks/docx/cleanup.py
index 8fb3b60592..6aec7399ab 100644
--- a/src/calibre/ebooks/docx/cleanup.py
+++ b/src/calibre/ebooks/docx/cleanup.py
@@ -115,7 +115,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
 
     # Merge consecutive spans that have the same styling
     current_run = []
-    for span in root.xpath('//span[not(@style)]'):
+    for span in root.xpath('//span[not(@style or @lang)]'):
         if not current_run:
             current_run.append(span)
         else:
@@ -144,6 +144,8 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
                     parent.set('class', pclass)
                 parent.text = span.text
                 parent.remove(span)
+                if span.get('lang'):
+                    parent.set('lang', span.get('lang'))
                 for child in span:
                     parent.append(child)
 
@@ -159,7 +161,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
                 del span.attrib['class']
 
     # Get rid of <span>s that have no styling
-    for span in root.xpath('//span[not(@class) and not(@id) and not(@style)]'):
+    for span in root.xpath('//span[not(@class or @id or @style or @lang)]'):
         lift(span)
 
     # Convert <p><br style="page-break-after:always"> </p> style page breaks
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
index b18496fcc9..eac3aae6ee 100644
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -43,6 +43,13 @@ class Text:
         setattr(self.elem, self.attr, ''.join(self.buf))
         self.elem, self.attr, self.buf = elem, 'tail', []
 
+def html_lang(docx_lang):
+    lang = canonicalize_lang(docx_lang)
+    if lang and lang != 'und':
+        lang = lang_as_iso639_1(lang)
+        if lang:
+            return lang
+
 class Convert(object):
 
     def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
@@ -77,11 +84,12 @@ class Convert(object):
             child.tail = '\n\t\t'
         self.html[0][-1].tail = '\n\t'
         self.html[1].text = self.html[1].tail = '\n'
-        lang = canonicalize_lang(self.mi.language)
-        if lang and lang != 'und':
-            lang = lang_as_iso639_1(lang)
-            if lang:
-                self.html.set('lang', lang)
+        lang = html_lang(self.mi.language)
+        if lang:
+            self.html.set('lang', lang)
+            self.doc_lang = lang
+        else:
+            self.doc_lang = None
 
     def __call__(self):
         doc = self.docx.document
@@ -626,7 +634,9 @@ class Convert(object):
         if style.vert_align in {'superscript', 'subscript'}:
             ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
         if style.lang is not inherit:
-            ans.lang = style.lang
+            lang = html_lang(style.lang)
+            if lang is not None and lang != self.doc_lang:
+                ans.set('lang', lang)
         return ans
 
     def add_frame(self, html_obj, style):