Conversion pipeline: More robust conversion of tags to text when detecting structure

2025-07-09 03:04:10 -04:00 · 2010-05-02 14:47:52 -06:00 · 2010-05-02 14:47:52 -06:00 · 3df90b8926
commit 3df90b8926
parent 8dc171ee3a
2 changed files with 7 additions and 6 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -294,6 +294,9 @@ def xml2str(root, pretty_print=False, strip_comments=False):
 def xml2unicode(root, pretty_print=False):
    return etree.tostring(root, pretty_print=pretty_print)

+def xml2text(elem):
+    return etree.tostring(elem, method='text', encoding=unicode)
+
 ASCII_CHARS   = set(chr(x) for x in xrange(128))
 UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
 URL_SAFE      = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
--- a/src/calibre/ebooks/oeb/transforms/structure.py
+++ b/src/calibre/ebooks/oeb/transforms/structure.py
@@ -11,7 +11,7 @@ import re
 from lxml import etree
 from urlparse import urlparse

-from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML
+from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text
 from calibre.ebooks import ConversionError

 def XPath(x):
@@ -79,8 +79,7 @@ class DetectStructure(object):
            page_break_before = 'display: block; page-break-before: always'
            page_break_after = 'display: block; page-break-after: always'
            for item, elem in self.detected_chapters:
-                text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
-                text = text.strip()
+                text = xml2text(elem).strip()
                self.log('\tDetected chapter:', text[:50])
                if chapter_mark == 'none':
                    continue
@@ -120,8 +119,7 @@ class DetectStructure(object):
                    if frag:
                        href = '#'.join((href, frag))
                    if not self.oeb.toc.has_href(href):
-                        text = u' '.join([t.strip() for t in \
-                                a.xpath('descendant::text()')])
+                        text = xml2text(a)
                        text = text[:100].strip()
                        if not self.oeb.toc.has_text(text):
                            num += 1
@@ -135,7 +133,7 @@ class DetectStructure(object):


    def elem_to_link(self, item, elem, counter):
-        text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
+        text = xml2text(elem)
        text = text[:100].strip()
        id = elem.get('id', 'calibre_toc_%d'%counter)
        elem.set('id', id)