From 0bcaa648bde842e003f6138caab2599f4ed22a2a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 30 Nov 2009 11:25:51 -0700 Subject: [PATCH] Fix #4087 (Conversion from .LIT to .EPUB loses a chapter from the book) --- Changelog.yaml | 3 +++ src/calibre/ebooks/epub/output.py | 2 +- src/calibre/ebooks/lit/reader.py | 4 ++++ src/calibre/ebooks/oeb/base.py | 2 ++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Changelog.yaml b/Changelog.yaml index 1e828dcfc1..7e8cd4b162 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -49,6 +49,9 @@ - title: Add 0x0c01 to the list of product ids for HTC Hero (Android) driver tickets: [4088] + - title: "LIT Input: Remove more invalid markup present in LIT files created by Microsoft Word plugins" + tickets: [4087] + new recipes: - title: The Economist (no subscription required) author: Kovid Goyal diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py index 45b0b18859..77a03916d8 100644 --- a/src/calibre/ebooks/epub/output.py +++ b/src/calibre/ebooks/epub/output.py @@ -312,7 +312,6 @@ class EPUBOutput(OutputFormatPlugin): for tag in XPath('//h:center')(root): tag.tag = XHTML('div') tag.set('style', 'text-align:center') - # ADE can't handle & in an img url for tag in XPath('//h:img[@src]')(root): tag.set('src', tag.get('src', '').replace('&', '')) @@ -340,6 +339,7 @@ class EPUBOutput(OutputFormatPlugin): else: self.oeb.log.warn('No stylesheet found') + def workaround_sony_quirks(self): ''' Perform toc link transforms to alleviate slow loading. 
diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index ec7d858107..a00f393b39 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -866,6 +866,10 @@ class LitContainer(object): atoms = self._litfile.get_atoms(entry) unbin = UnBinary(raw, name, manifest, HTML_MAP, atoms) content = HTML_DECL + str(unbin) + tags = ('personname', 'place', 'city', 'country-region') + pat = r'(?i)</{0,1}(%s)>'%('|'.join(tags)) + content = re.sub(pat, '', content) + content = re.sub(r'<(/{0,1})form>', r'<\1div>', content) else: internal = '/'.join(('/data', entry.internal)) content = self._litfile.get_file(internal) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 05d4cbb256..11dab5c102 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -799,6 +799,7 @@ class Manifest(object): try: data = etree.fromstring(data) except etree.XMLSyntaxError, err: + self.log.exception('Initial parse failed:') repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) data = ENTITY_RE.sub(repl, data) try: @@ -843,6 +844,7 @@ class Manifest(object): # Force into the XHTML namespace if not namespace(data.tag): + self.oeb.log.warn('Forcing', self.href, 'into XHTML namespace') data.attrib['xmlns'] = XHTML_NS data = etree.tostring(data, encoding=unicode)