From 00ea2cc4a48b4d10d6710fadb59a0a430789752d Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 20 Nov 2014 08:27:06 +0530
Subject: [PATCH] DOCX Input: Add support for multilingual documents. Fixes
 #1394428 [docx driver doesn't handle different
 languages](https://bugs.launchpad.net/calibre/+bug/1394428)

---
 src/calibre/ebooks/docx/cleanup.py |  6 ++++--
 src/calibre/ebooks/docx/to_html.py | 22 ++++++++++++++++------
 2 files changed, 20 insertions(+), 8 deletions(-)
diff --git a/src/calibre/ebooks/docx/cleanup.py b/src/calibre/ebooks/docx/cleanup.py
index 8fb3b60592..6aec7399ab 100644
--- a/src/calibre/ebooks/docx/cleanup.py
+++ b/src/calibre/ebooks/docx/cleanup.py
@@ -115,7 +115,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
 
     # Merge consecutive spans that have the same styling
     current_run = []
-    for span in root.xpath('//span[not(@style)]'):
+    for span in root.xpath('//span[not(@style or @lang)]'):
         if not current_run:
             current_run.append(span)
         else:
@@ -144,6 +144,8 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
                     parent.set('class', pclass)
                 parent.text = span.text
                 parent.remove(span)
+                if span.get('lang'):
+                    parent.set('lang', span.get('lang'))
                 for child in span:
                     parent.append(child)
 
@@ -159,7 +161,7 @@ def cleanup_markup(log, root, styles, dest_dir, detect_cover):
                 del span.attrib['class']
 
     # Get rid of <span>s that have no styling
-    for span in root.xpath('//span[not(@class) and not(@id) and not(@style)]'):
+    for span in root.xpath('//span[not(@class or @id or @style or @lang)]'):
         lift(span)
 
     # Convert <p><br style="page-break-after:always"> </p> style page breaks
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
index b18496fcc9..eac3aae6ee 100644
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -43,6 +43,13 @@ class Text:
         setattr(self.elem, self.attr, ''.join(self.buf))
         self.elem, self.attr, self.buf = elem, 'tail', []
 
+def html_lang(docx_lang):  # Map a DOCX language value to a code used for HTML lang attributes; implicitly returns None when no usable code results
+    lang = canonicalize_lang(docx_lang)  # normalise through canonicalize_lang (helper defined outside this patch)
+    if lang and lang != 'und':  # skip empty results and 'und' (presumably the "undetermined" code -- confirm)
+        lang = lang_as_iso639_1(lang)  # presumably yields a two-letter ISO 639-1 code, judging by the helper's name -- confirm
+        if lang:  # only a truthy conversion result is handed back to the caller
+            return lang
+
 class Convert(object):
 
     def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
@@ -77,11 +84,12 @@ class Convert(object):
             child.tail = '\n\t\t'
         self.html[0][-1].tail = '\n\t'
         self.html[1].text = self.html[1].tail = '\n'
-        lang = canonicalize_lang(self.mi.language)
-        if lang and lang != 'und':
-            lang = lang_as_iso639_1(lang)
-            if lang:
-                self.html.set('lang', lang)
+        lang = html_lang(self.mi.language)
+        if lang:
+            self.html.set('lang', lang)
+            self.doc_lang = lang
+        else:
+            self.doc_lang = None
 
     def __call__(self):
         doc = self.docx.document
@@ -626,7 +634,9 @@ class Convert(object):
         if style.vert_align in {'superscript', 'subscript'}:
             ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
         if style.lang is not inherit:
-            ans.lang = style.lang
+            lang = html_lang(style.lang)
+            if lang is not None and lang != self.doc_lang:
+                ans.set('lang', lang)
         return ans
 
     def add_frame(self, html_obj, style):