mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Conversion pipeline: More robust conversion of tags to text when detecting structure
This commit is contained in:
parent
8dc171ee3a
commit
3df90b8926
@ -294,6 +294,9 @@ def xml2str(root, pretty_print=False, strip_comments=False):
|
||||
def xml2unicode(root, pretty_print=False):
    """Serialize ``root`` with ``etree.tostring`` and return the result.

    ``pretty_print`` is forwarded unchanged to ``etree.tostring``.

    NOTE(review): no ``encoding`` argument is passed, so the return type
    is whatever ``etree.tostring`` yields by default -- confirm that this
    matches what callers expect from a function with this name.
    """
    serialized = etree.tostring(root, pretty_print=pretty_print)
    return serialized
|
||||
|
||||
def xml2text(elem):
    """Return the text of ``elem`` and all of its descendants as unicode.

    Markup is discarded by serializing with ``method='text'``. Only text
    that lives inside ``elem`` is returned; the caller is responsible for
    any stripping or truncation.
    """
    # lxml serializes an element's tail (the text that follows its closing
    # tag inside the parent) by default. For an inline element such as a
    # link in the middle of a paragraph, that would append the rest of the
    # sentence to the extracted title, so the tail is excluded explicitly.
    return etree.tostring(elem, method='text', encoding=unicode,
            with_tail=False)
|
||||
|
||||
ASCII_CHARS = set(chr(x) for x in xrange(128))
|
||||
UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
|
||||
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||
|
@ -11,7 +11,7 @@ import re
|
||||
from lxml import etree
|
||||
from urlparse import urlparse
|
||||
|
||||
from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML
|
||||
from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text
|
||||
from calibre.ebooks import ConversionError
|
||||
|
||||
def XPath(x):
|
||||
@ -79,8 +79,7 @@ class DetectStructure(object):
|
||||
page_break_before = 'display: block; page-break-before: always'
|
||||
page_break_after = 'display: block; page-break-after: always'
|
||||
for item, elem in self.detected_chapters:
|
||||
text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
|
||||
text = text.strip()
|
||||
text = xml2text(elem).strip()
|
||||
self.log('\tDetected chapter:', text[:50])
|
||||
if chapter_mark == 'none':
|
||||
continue
|
||||
@ -120,8 +119,7 @@ class DetectStructure(object):
|
||||
if frag:
|
||||
href = '#'.join((href, frag))
|
||||
if not self.oeb.toc.has_href(href):
|
||||
text = u' '.join([t.strip() for t in \
|
||||
a.xpath('descendant::text()')])
|
||||
text = xml2text(a)
|
||||
text = text[:100].strip()
|
||||
if not self.oeb.toc.has_text(text):
|
||||
num += 1
|
||||
@ -135,7 +133,7 @@ class DetectStructure(object):
|
||||
|
||||
|
||||
def elem_to_link(self, item, elem, counter):
|
||||
text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
|
||||
text = xml2text(elem)
|
||||
text = text[:100].strip()
|
||||
id = elem.get('id', 'calibre_toc_%d'%counter)
|
||||
elem.set('id', id)
|
||||
|
Loading…
x
Reference in New Issue
Block a user