Conversion pipeline: More robust conversion of tags to text when detecting structure

This commit is contained in:
Kovid Goyal 2010-05-02 14:47:52 -06:00
parent 8dc171ee3a
commit 3df90b8926
2 changed files with 7 additions and 6 deletions

View File

@ -294,6 +294,9 @@ def xml2str(root, pretty_print=False, strip_comments=False):
def xml2unicode(root, pretty_print=False): def xml2unicode(root, pretty_print=False):
return etree.tostring(root, pretty_print=pretty_print) return etree.tostring(root, pretty_print=pretty_print)
def xml2text(elem):
    '''
    Return the text content of `elem` as a unicode string.

    The element is serialized with ``method='text'``, so markup is dropped
    and only the text remains.

    :param elem: The element to convert to text.
    :return: A unicode string.
    '''
    # with_tail=False: by default tostring() also emits the text that follows
    # the element's closing tag (its tail). That text is not part of the
    # element, and it would otherwise leak into detected chapter/TOC titles.
    return etree.tostring(elem, method='text', encoding=unicode,
            with_tail=False)
ASCII_CHARS = set(chr(x) for x in xrange(128)) ASCII_CHARS = set(chr(x) for x in xrange(128))
UNIBYTE_CHARS = set(chr(x) for x in xrange(256)) UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'

View File

@ -11,7 +11,7 @@ import re
from lxml import etree from lxml import etree
from urlparse import urlparse from urlparse import urlparse
from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text
from calibre.ebooks import ConversionError from calibre.ebooks import ConversionError
def XPath(x): def XPath(x):
@ -79,8 +79,7 @@ class DetectStructure(object):
page_break_before = 'display: block; page-break-before: always' page_break_before = 'display: block; page-break-before: always'
page_break_after = 'display: block; page-break-after: always' page_break_after = 'display: block; page-break-after: always'
for item, elem in self.detected_chapters: for item, elem in self.detected_chapters:
text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) text = xml2text(elem).strip()
text = text.strip()
self.log('\tDetected chapter:', text[:50]) self.log('\tDetected chapter:', text[:50])
if chapter_mark == 'none': if chapter_mark == 'none':
continue continue
@ -120,8 +119,7 @@ class DetectStructure(object):
if frag: if frag:
href = '#'.join((href, frag)) href = '#'.join((href, frag))
if not self.oeb.toc.has_href(href): if not self.oeb.toc.has_href(href):
text = u' '.join([t.strip() for t in \ text = xml2text(a)
a.xpath('descendant::text()')])
text = text[:100].strip() text = text[:100].strip()
if not self.oeb.toc.has_text(text): if not self.oeb.toc.has_text(text):
num += 1 num += 1
@ -135,7 +133,7 @@ class DetectStructure(object):
def elem_to_link(self, item, elem, counter): def elem_to_link(self, item, elem, counter):
text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) text = xml2text(elem)
text = text[:100].strip() text = text[:100].strip()
id = elem.get('id', 'calibre_toc_%d'%counter) id = elem.get('id', 'calibre_toc_%d'%counter)
elem.set('id', id) elem.set('id', id)