From 0bcaa648bde842e003f6138caab2599f4ed22a2a Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 30 Nov 2009 11:25:51 -0700
Subject: [PATCH] Fix #4087 (Conversion from .LIT to .EPUB loses a chapter from
 the book)

---
 Changelog.yaml                    | 3 +++
 src/calibre/ebooks/epub/output.py | 2 +-
 src/calibre/ebooks/lit/reader.py  | 4 ++++
 src/calibre/ebooks/oeb/base.py    | 2 ++
 4 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/Changelog.yaml b/Changelog.yaml
index 1e828dcfc1..7e8cd4b162 100644
--- a/Changelog.yaml
+++ b/Changelog.yaml
@@ -49,6 +49,9 @@
     - title: Add 0x0c01 to the list of product ids for HTC Hero (Android) driver
       tickets: [4088]
 
+    - title: "LIT Input: Remove more invalid markup present in LIT files created by Microsoft Word plugins"
+      tickets: [4087]
+
   new recipes:
     - title: The Economist (no subscription required)
       author: Kovid Goyal
diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py
index 45b0b18859..77a03916d8 100644
--- a/src/calibre/ebooks/epub/output.py
+++ b/src/calibre/ebooks/epub/output.py
@@ -312,7 +312,6 @@ class EPUBOutput(OutputFormatPlugin):
             for tag in XPath('//h:center')(root):
                 tag.tag = XHTML('div')
                 tag.set('style', 'text-align:center')
-
             # ADE can't handle &amp; in an img url
             for tag in XPath('//h:img[@src]')(root):
                 tag.set('src', tag.get('src', '').replace('&', ''))
@@ -340,6 +339,7 @@ class EPUBOutput(OutputFormatPlugin):
         else:
             self.oeb.log.warn('No stylesheet found')
 
+
     def workaround_sony_quirks(self):
         '''
         Perform toc link transforms to alleviate slow loading.
diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py
index ec7d858107..a00f393b39 100644
--- a/src/calibre/ebooks/lit/reader.py
+++ b/src/calibre/ebooks/lit/reader.py
@@ -866,6 +866,10 @@ class LitContainer(object):
             atoms = self._litfile.get_atoms(entry)
             unbin = UnBinary(raw, name, manifest, HTML_MAP, atoms)
             content = HTML_DECL + str(unbin)
+            tags = ('personname', 'place', 'city', 'country-region')
+            pat = r'(?i)</{0,1}st1:(%s)>'%('|'.join(tags))
+            content = re.sub(pat, '', content)
+            content = re.sub(r'<(/{0,1})form>', r'<\1div>', content)
         else:
             internal = '/'.join(('/data', entry.internal))
             content = self._litfile.get_file(internal)
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 05d4cbb256..11dab5c102 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -799,6 +799,7 @@ class Manifest(object):
                 try:
                     data = etree.fromstring(data)
                 except etree.XMLSyntaxError, err:
+                    self.log.exception('Initial parse failed:')
                     repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
                     data = ENTITY_RE.sub(repl, data)
                     try:
@@ -843,6 +844,7 @@ class Manifest(object):
 
             # Force into the XHTML namespace
             if not namespace(data.tag):
+                self.oeb.log.warn('Forcing', self.href, 'into XHTML namespace')
                 data.attrib['xmlns'] = XHTML_NS
                 data = etree.tostring(data, encoding=unicode)