From 3df90b8926895953d371b1b0618fbeb42d9b74e4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 2 May 2010 14:47:52 -0600 Subject: [PATCH] Conversion pipeline: More robust conversion of tags to text when detecting structure --- src/calibre/ebooks/oeb/base.py | 3 +++ src/calibre/ebooks/oeb/transforms/structure.py | 10 ++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index f770622952..026c072845 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -294,6 +294,9 @@ def xml2str(root, pretty_print=False, strip_comments=False): def xml2unicode(root, pretty_print=False): return etree.tostring(root, pretty_print=pretty_print) +def xml2text(elem): + return etree.tostring(elem, method='text', encoding=unicode) + ASCII_CHARS = set(chr(x) for x in xrange(128)) UNIBYTE_CHARS = set(chr(x) for x in xrange(256)) URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' diff --git a/src/calibre/ebooks/oeb/transforms/structure.py b/src/calibre/ebooks/oeb/transforms/structure.py index 15e9675aa8..07235b4fb0 100644 --- a/src/calibre/ebooks/oeb/transforms/structure.py +++ b/src/calibre/ebooks/oeb/transforms/structure.py @@ -11,7 +11,7 @@ import re from lxml import etree from urlparse import urlparse -from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML +from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text from calibre.ebooks import ConversionError def XPath(x): @@ -79,8 +79,7 @@ class DetectStructure(object): page_break_before = 'display: block; page-break-before: always' page_break_after = 'display: block; page-break-after: always' for item, elem in self.detected_chapters: - text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) - text = text.strip() + text = xml2text(elem).strip() self.log('\tDetected chapter:', text[:50]) if chapter_mark == 'none': continue @@ -120,8 +119,7 @@ class DetectStructure(object): if frag: href = '#'.join((href, frag)) if not self.oeb.toc.has_href(href): - text = u' '.join([t.strip() for t in \ - a.xpath('descendant::text()')]) + text = xml2text(a) text = text[:100].strip() if not self.oeb.toc.has_text(text): num += 1 @@ -135,7 +133,7 @@ class DetectStructure(object): def elem_to_link(self, item, elem, counter): - text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) + text = xml2text(elem) text = text[:100].strip() id = elem.get('id', 'calibre_toc_%d'%counter) elem.set('id', id)