From f31692c268cb8c02beaacd9dc2ac999d813971a6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 27 Sep 2009 23:43:04 -0600 Subject: [PATCH] Conversion pipeline: Remove empty <i> and <b> tags. Fixes #3564 (PDF to EPUB formatting problems) --- src/calibre/ebooks/oeb/base.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 5ee829c8f4..5e3d2296ae 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -912,23 +912,27 @@ class Manifest(object): if key == 'lang' or key.endswith('}lang'): body.attrib.pop(key) + def remove_elem(a): + p = a.getparent() + idx = p.index(a) -1 + p.remove(a) + if a.tail: + if idx <= 0: + if p.text is None: + p.text = '' + p.text += a.tail + else: + if p[idx].tail is None: + p[idx].tail = '' + p[idx].tail += a.tail + # Remove hyperlinks with no content as they cause rendering # artifacts in browser based renderers - for a in xpath(data, '//h:a[@href]'): + # Also remove empty <i> and <b> tags + for a in xpath(data, '//h:a[@href]|//h:i|//h:b'): if a.get('id', None) is None and a.get('name', None) is None \ and len(a) == 0 and not a.text: - p = a.getparent() - idx = p.index(a) -1 - p.remove(a) - if a.tail: - if idx <= 0: - if p.text is None: - p.text = '' - p.text += a.tail - else: - if p[idx].tail is None: - p[idx].tail = '' - p[idx].tail += a.tail + remove_elem(a) return data