Pull from trunk

This commit is contained in:
Kovid Goyal 2010-05-02 15:17:32 -06:00
commit 536eb02887
3 changed files with 17 additions and 10 deletions

View File

@ -294,6 +294,9 @@ def xml2str(root, pretty_print=False, strip_comments=False):
def xml2unicode(root, pretty_print=False):
    """Serialize ``root`` with ``etree.tostring``.

    :param root: the element (or tree) to serialize.
    :param pretty_print: forwarded unchanged to ``etree.tostring``.
    :return: whatever ``etree.tostring`` produces for these arguments.
    """
    # NOTE(review): no ``encoding`` argument is passed here, so the result
    # type is lxml's default for ``tostring`` -- confirm that this really is
    # a unicode string, as the function name suggests.
    serialized = etree.tostring(root, pretty_print=pretty_print)
    return serialized
def xml2text(elem):
    """Return the text content of ``elem`` as a unicode string.

    The element is serialized with lxml's ``text`` output method, so markup
    is dropped and only text is kept. The tail text that follows ``elem``
    itself is not included.

    :param elem: the element whose text is wanted.
    :return: the serialized text, requested as ``unicode``.
    """
    options = dict(method='text', encoding=unicode, with_tail=False)
    return etree.tostring(elem, **options)
ASCII_CHARS = set(chr(x) for x in xrange(128))
UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'

View File

@ -5,7 +5,7 @@ __docformat__ = 'restructuredtext en'
'''
Splitting of the XHTML flows. Splitting can happen on page boundaries or can be
forces at "likely" locations to conform to size limitations. This transform
forced at "likely" locations to conform to size limitations. This transform
assumes a prior call to the flatcss transform.
'''
@ -385,12 +385,18 @@ class FlowSplitter(object):
raise SplitError(self.item.href, root)
self.log.debug('\t\t\tSplit point:', split_point.tag, tree.getpath(split_point))
for t in self.do_split(tree, split_point, before):
trees = self.do_split(tree, split_point, before)
sizes = [len(tostring(t.getroot())) for t in trees]
if min(sizes) < 5*1024:
self.log.debug('\t\t\tSplit tree too small')
self.split_to_size(tree)
return
for t, size in zip(trees, sizes):
r = t.getroot()
if self.is_page_empty(r):
continue
size = len(tostring(r))
if size <= self.max_flow_size:
elif size <= self.max_flow_size:
self.split_trees.append(t)
self.log.debug(
'\t\t\tCommitted sub-tree #%d (%d KB)'%(

View File

@ -11,7 +11,7 @@ import re
from lxml import etree
from urlparse import urlparse
from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML
from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text
from calibre.ebooks import ConversionError
def XPath(x):
@ -79,8 +79,7 @@ class DetectStructure(object):
page_break_before = 'display: block; page-break-before: always'
page_break_after = 'display: block; page-break-after: always'
for item, elem in self.detected_chapters:
text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
text = text.strip()
text = xml2text(elem).strip()
self.log('\tDetected chapter:', text[:50])
if chapter_mark == 'none':
continue
@ -120,8 +119,7 @@ class DetectStructure(object):
if frag:
href = '#'.join((href, frag))
if not self.oeb.toc.has_href(href):
text = u' '.join([t.strip() for t in \
a.xpath('descendant::text()')])
text = xml2text(a)
text = text[:100].strip()
if not self.oeb.toc.has_text(text):
num += 1
@ -135,7 +133,7 @@ class DetectStructure(object):
def elem_to_link(self, item, elem, counter):
text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
text = xml2text(elem)
text = text[:100].strip()
id = elem.get('id', 'calibre_toc_%d'%counter)
elem.set('id', id)