Pull from trunk

This commit is contained in:
Kovid Goyal 2010-05-02 15:17:32 -06:00
commit 536eb02887
3 changed files with 17 additions and 10 deletions

View File

@@ -294,6 +294,9 @@ def xml2str(root, pretty_print=False, strip_comments=False):
def xml2unicode(root, pretty_print=False): def xml2unicode(root, pretty_print=False):
return etree.tostring(root, pretty_print=pretty_print) return etree.tostring(root, pretty_print=pretty_print)
def xml2text(elem):
return etree.tostring(elem, method='text', encoding=unicode, with_tail=False)
ASCII_CHARS = set(chr(x) for x in xrange(128)) ASCII_CHARS = set(chr(x) for x in xrange(128))
UNIBYTE_CHARS = set(chr(x) for x in xrange(256)) UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'

View File

@@ -5,7 +5,7 @@ __docformat__ = 'restructuredtext en'
''' '''
Splitting of the XHTML flows. Splitting can happen on page boundaries or can be Splitting of the XHTML flows. Splitting can happen on page boundaries or can be
forces at "likely" locations to conform to size limitations. This transform forced at "likely" locations to conform to size limitations. This transform
assumes a prior call to the flatcss transform. assumes a prior call to the flatcss transform.
''' '''
@@ -385,12 +385,18 @@ class FlowSplitter(object):
raise SplitError(self.item.href, root) raise SplitError(self.item.href, root)
self.log.debug('\t\t\tSplit point:', split_point.tag, tree.getpath(split_point)) self.log.debug('\t\t\tSplit point:', split_point.tag, tree.getpath(split_point))
for t in self.do_split(tree, split_point, before): trees = self.do_split(tree, split_point, before)
sizes = [len(tostring(t.getroot())) for t in trees]
if min(sizes) < 5*1024:
self.log.debug('\t\t\tSplit tree too small')
self.split_to_size(tree)
return
for t, size in zip(trees, sizes):
r = t.getroot() r = t.getroot()
if self.is_page_empty(r): if self.is_page_empty(r):
continue continue
size = len(tostring(r)) elif size <= self.max_flow_size:
if size <= self.max_flow_size:
self.split_trees.append(t) self.split_trees.append(t)
self.log.debug( self.log.debug(
'\t\t\tCommitted sub-tree #%d (%d KB)'%( '\t\t\tCommitted sub-tree #%d (%d KB)'%(

View File

@@ -11,7 +11,7 @@ import re
from lxml import etree from lxml import etree
from urlparse import urlparse from urlparse import urlparse
from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text
from calibre.ebooks import ConversionError from calibre.ebooks import ConversionError
def XPath(x): def XPath(x):
@@ -79,8 +79,7 @@ class DetectStructure(object):
page_break_before = 'display: block; page-break-before: always' page_break_before = 'display: block; page-break-before: always'
page_break_after = 'display: block; page-break-after: always' page_break_after = 'display: block; page-break-after: always'
for item, elem in self.detected_chapters: for item, elem in self.detected_chapters:
text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) text = xml2text(elem).strip()
text = text.strip()
self.log('\tDetected chapter:', text[:50]) self.log('\tDetected chapter:', text[:50])
if chapter_mark == 'none': if chapter_mark == 'none':
continue continue
@@ -120,8 +119,7 @@ class DetectStructure(object):
if frag: if frag:
href = '#'.join((href, frag)) href = '#'.join((href, frag))
if not self.oeb.toc.has_href(href): if not self.oeb.toc.has_href(href):
text = u' '.join([t.strip() for t in \ text = xml2text(a)
a.xpath('descendant::text()')])
text = text[:100].strip() text = text[:100].strip()
if not self.oeb.toc.has_text(text): if not self.oeb.toc.has_text(text):
num += 1 num += 1
@@ -135,7 +133,7 @@ class DetectStructure(object):
def elem_to_link(self, item, elem, counter): def elem_to_link(self, item, elem, counter):
text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) text = xml2text(elem)
text = text[:100].strip() text = text[:100].strip()
id = elem.get('id', 'calibre_toc_%d'%counter) id = elem.get('id', 'calibre_toc_%d'%counter)
elem.set('id', id) elem.set('id', id)