mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Initial (untested) port of splitting code to OEBBook
This commit is contained in:
parent
dc5299b8a8
commit
a423691dd5
@ -272,11 +272,26 @@ def XPath(expr):
|
|||||||
def xpath(elem, expr):
|
def xpath(elem, expr):
|
||||||
return elem.xpath(expr, namespaces=XPNSMAP)
|
return elem.xpath(expr, namespaces=XPNSMAP)
|
||||||
|
|
||||||
def xml2str(root, pretty_print=False):
|
def _prepare_xml_for_serialization(root):
|
||||||
return etree.tostring(root, encoding='utf-8', xml_declaration=True,
|
root.set('xmlns', XHTML_NS)
|
||||||
|
root.set('{%s}xlink'%XHTML_NS, XLINK_NS)
|
||||||
|
for x in root.iter():
|
||||||
|
if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg':
|
||||||
|
x.set('xmlns', SVG_NS)
|
||||||
|
|
||||||
|
def xml2str(root, pretty_print=False, strip_comments=False):
|
||||||
|
_prepare_xml_for_serialization(root)
|
||||||
|
ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
|
||||||
pretty_print=pretty_print)
|
pretty_print=pretty_print)
|
||||||
|
|
||||||
|
if strip_comments:
|
||||||
|
ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)
|
||||||
|
|
||||||
|
return ans
|
||||||
|
|
||||||
|
|
||||||
def xml2unicode(root, pretty_print=False):
|
def xml2unicode(root, pretty_print=False):
|
||||||
|
_prepare_xml_for_serialization(root)
|
||||||
return etree.tostring(root, pretty_print=pretty_print)
|
return etree.tostring(root, pretty_print=pretty_print)
|
||||||
|
|
||||||
ASCII_CHARS = set(chr(x) for x in xrange(128))
|
ASCII_CHARS = set(chr(x) for x in xrange(128))
|
||||||
@ -826,6 +841,11 @@ class Manifest(object):
|
|||||||
return xml2str(data, pretty_print=self.oeb.pretty_print)
|
return xml2str(data, pretty_print=self.oeb.pretty_print)
|
||||||
if isinstance(data, unicode):
|
if isinstance(data, unicode):
|
||||||
return data.encode('utf-8')
|
return data.encode('utf-8')
|
||||||
|
if hasattr(data, 'cssText'):
|
||||||
|
data = data.cssText
|
||||||
|
if isinstance(data, unicode):
|
||||||
|
data = data.encode('utf-8')
|
||||||
|
return data
|
||||||
return str(data)
|
return str(data)
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
@ -834,6 +854,8 @@ class Manifest(object):
|
|||||||
return xml2unicode(data, pretty_print=self.oeb.pretty_print)
|
return xml2unicode(data, pretty_print=self.oeb.pretty_print)
|
||||||
if isinstance(data, unicode):
|
if isinstance(data, unicode):
|
||||||
return data
|
return data
|
||||||
|
if hasattr(data, 'cssText'):
|
||||||
|
return data.cssText
|
||||||
return unicode(data)
|
return unicode(data)
|
||||||
|
|
||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
@ -1044,6 +1066,12 @@ class Spine(object):
|
|||||||
self.items[i].spine_position = i
|
self.items[i].spine_position = i
|
||||||
item.spine_position = None
|
item.spine_position = None
|
||||||
|
|
||||||
|
def index(self, item):
|
||||||
|
for i, x in enumerate(self):
|
||||||
|
if item == x:
|
||||||
|
return i
|
||||||
|
return -1
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
for item in self.items:
|
for item in self.items:
|
||||||
yield item
|
yield item
|
||||||
|
@ -162,7 +162,6 @@ class EbookIterator(object):
|
|||||||
s.pages = p
|
s.pages = p
|
||||||
start = 1
|
start = 1
|
||||||
|
|
||||||
|
|
||||||
for s in self.spine:
|
for s in self.spine:
|
||||||
s.start_page = start
|
s.start_page = start
|
||||||
start += s.pages
|
start += s.pages
|
||||||
|
@ -22,7 +22,6 @@ class OEBOutput(OutputFormatPlugin):
|
|||||||
if not os.path.exists(output_path):
|
if not os.path.exists(output_path):
|
||||||
os.makedirs(output_path)
|
os.makedirs(output_path)
|
||||||
from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME
|
from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME
|
||||||
from calibre.ebooks.html import tostring as html_tostring
|
|
||||||
with CurrentDir(output_path):
|
with CurrentDir(output_path):
|
||||||
results = oeb_book.to_opf2(page_map=True)
|
results = oeb_book.to_opf2(page_map=True)
|
||||||
for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
|
for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
|
||||||
@ -38,16 +37,7 @@ class OEBOutput(OutputFormatPlugin):
|
|||||||
dir = os.path.dirname(path)
|
dir = os.path.dirname(path)
|
||||||
if not os.path.exists(dir):
|
if not os.path.exists(dir):
|
||||||
os.makedirs(dir)
|
os.makedirs(dir)
|
||||||
raw = item.data
|
|
||||||
if not isinstance(raw, basestring):
|
|
||||||
if hasattr(raw, 'cssText'):
|
|
||||||
raw = raw.cssText
|
|
||||||
else:
|
|
||||||
raw = html_tostring(raw,
|
|
||||||
pretty_print=opts.pretty_print)
|
|
||||||
if isinstance(raw, unicode):
|
|
||||||
raw = raw.encode('utf-8')
|
|
||||||
with open(path, 'wb') as f:
|
with open(path, 'wb') as f:
|
||||||
f.write(raw)
|
f.write(str(item))
|
||||||
|
|
||||||
|
|
||||||
|
@ -4,21 +4,25 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
|||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Split the flows in an epub file to conform to size limitations.
|
Splitting of the XHTML flows. Splitting can happen on page boundaries or can be
|
||||||
|
forced at "likely" locations to conform to size limitations. This transform
|
||||||
|
assumes a prior call to the flatcss transform.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import os, math, functools, collections, re, copy, sys
|
import os, math, functools, collections, re, copy
|
||||||
|
|
||||||
from lxml.etree import XPath as _XPath
|
from lxml.etree import XPath as _XPath
|
||||||
from lxml import etree, html
|
from lxml import etree, html
|
||||||
from lxml.cssselect import CSSSelector
|
from lxml.cssselect import CSSSelector
|
||||||
|
|
||||||
from calibre.ebooks.metadata.opf2 import OPF
|
from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP, urldefrag, \
|
||||||
|
rewrite_links
|
||||||
from calibre.ebooks.epub import tostring, rules
|
from calibre.ebooks.epub import tostring, rules
|
||||||
from calibre import CurrentDir
|
|
||||||
|
|
||||||
XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'})
|
NAMESPACES = dict(XPNSMAP)
|
||||||
content = functools.partial(os.path.join, 'content')
|
NAMESPACES['re'] = 'http://exslt.org/regular-expressions'
|
||||||
|
|
||||||
|
XPath = functools.partial(_XPath, namespaces=NAMESPACES)
|
||||||
|
|
||||||
SPLIT_ATTR = 'cs'
|
SPLIT_ATTR = 'cs'
|
||||||
SPLIT_POINT_ATTR = 'csp'
|
SPLIT_POINT_ATTR = 'csp'
|
||||||
@ -27,149 +31,166 @@ class SplitError(ValueError):
|
|||||||
|
|
||||||
def __init__(self, path, root):
|
def __init__(self, path, root):
|
||||||
size = len(tostring(root))/1024.
|
size = len(tostring(root))/1024.
|
||||||
ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')%
|
ValueError.__init__(self,
|
||||||
(os.path.basename(path), size))
|
_('Could not find reasonable point at which to split: '
|
||||||
|
'%s Sub-tree size: %d KB')%
|
||||||
|
(path, size))
|
||||||
|
|
||||||
|
class Split(object):
|
||||||
|
|
||||||
|
def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None,
|
||||||
|
max_flow_size=0):
|
||||||
|
self.split_on_page_breaks = split_on_page_breaks
|
||||||
|
self.page_breaks_xpath = page_breaks_xpath
|
||||||
|
self.max_flow_size = max_flow_size
|
||||||
|
if self.page_breaks_xpath is not None:
|
||||||
|
self.page_breaks_xpath = XPath(self.page_breaks_xpath)
|
||||||
|
|
||||||
|
def __call__(self, oeb, context):
|
||||||
|
self.oeb = oeb
|
||||||
|
self.log = oeb.log
|
||||||
|
self.map = {}
|
||||||
|
self.page_break_selectors = None
|
||||||
|
for item in self.oeb.manifest.items:
|
||||||
|
if etree.iselement(item.data):
|
||||||
|
self.split_item(item)
|
||||||
|
|
||||||
|
self.fix_links()
|
||||||
|
|
||||||
|
def split_item(self, item):
|
||||||
|
if self.split_on_page_breaks:
|
||||||
|
if self.page_breaks_xpath is None:
|
||||||
|
page_breaks, page_break_ids = self.find_page_breaks(item)
|
||||||
|
else:
|
||||||
|
page_breaks, page_break_ids = self.page_breaks_xpath(item.data)
|
||||||
|
|
||||||
|
splitter = FlowSplitter(item, page_breaks, page_break_ids,
|
||||||
|
self.max_flow_size, self.oeb)
|
||||||
|
if splitter.was_split:
|
||||||
|
self.map[item.href] = dict(splitter.anchor_map)
|
||||||
|
|
||||||
|
def find_page_breaks(self, item):
|
||||||
|
if self.page_break_selectors is None:
|
||||||
|
self.page_break_selectors = set([])
|
||||||
|
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
|
||||||
|
OEB_STYLES]
|
||||||
|
page_break_selectors = set([])
|
||||||
|
for rule in rules(stylesheets):
|
||||||
|
before = getattr(rule.style.getPropertyCSSValue(
|
||||||
|
'page-break-before'), 'cssText', '').strip().lower()
|
||||||
|
after = getattr(rule.style.getPropertyCSSValue(
|
||||||
|
'page-break-after'), 'cssText', '').strip().lower()
|
||||||
|
try:
|
||||||
|
if before and before != 'avoid':
|
||||||
|
page_break_selectors.add((CSSSelector(rule.selectorText),
|
||||||
|
True))
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
if after and after != 'avoid':
|
||||||
|
page_break_selectors.add((CSSSelector(rule.selectorText),
|
||||||
|
False))
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
page_breaks = set([])
|
||||||
|
for selector, before in page_break_selectors:
|
||||||
|
for elem in selector(item.data):
|
||||||
|
elem.pb_before = before
|
||||||
|
page_breaks.add(elem)
|
||||||
|
|
||||||
|
for i, elem in enumerate(item.data.iter()):
|
||||||
|
elem.pb_order = i
|
||||||
|
|
||||||
|
page_breaks = list(page_breaks)
|
||||||
|
page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
|
||||||
|
page_break_ids, page_breaks_ = [], []
|
||||||
|
for i, x in enumerate(page_breaks):
|
||||||
|
x.set('id', x.get('id', 'calibre_pb_%d'%i))
|
||||||
|
id = x.get('id')
|
||||||
|
page_breaks_.append((XPath('//*[@id="%s"]'%id), x.pb_before))
|
||||||
|
page_break_ids.append(id)
|
||||||
|
|
||||||
|
return page_breaks_, page_break_ids
|
||||||
|
|
||||||
|
def fix_links(self, opf):
|
||||||
|
'''
|
||||||
|
Fix references to the split files in other content files.
|
||||||
|
'''
|
||||||
|
for item in self.oeb.manifest:
|
||||||
|
if etree.iselement(item.data):
|
||||||
|
self.current_item = item
|
||||||
|
rewrite_links(item.data, self.rewrite_links)
|
||||||
|
|
||||||
|
def rewrite_links(self, url):
|
||||||
|
href, frag = urldefrag(url)
|
||||||
|
href = self.current_item.abshref(href)
|
||||||
|
if href in self.map:
|
||||||
|
anchor_map = self.map[href]
|
||||||
|
nhref = anchor_map[frag if frag else None]
|
||||||
|
if frag:
|
||||||
|
nhref = '#'.joinn(href, frag)
|
||||||
|
return nhref
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Splitter(object):
|
class FlowSplitter(object):
|
||||||
|
|
||||||
def __init__(self, path, opts, stylesheet_map, opf):
|
def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb):
|
||||||
self.setup_cli_handler(opts.verbose)
|
self.item = item
|
||||||
self.path = path
|
self.oeb = oeb
|
||||||
self.always_remove = not opts.preserve_tag_structure or \
|
self.log = oeb.log
|
||||||
os.stat(content(path)).st_size > 5*opts.profile.flow_size
|
self.page_breaks = page_breaks
|
||||||
self.base = (os.path.splitext(path)[0].replace('%', '%%') + '_split_%d.html')
|
self.page_break_ids = page_break_ids
|
||||||
self.opts = opts
|
self.max_flow_size = max_flow_size
|
||||||
self.orig_size = os.stat(content(path)).st_size
|
self.base = item.abshref(item.href)
|
||||||
self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
|
|
||||||
root = html.fromstring(open(content(path)).read())
|
|
||||||
|
|
||||||
self.page_breaks, self.trees = [], []
|
base, ext = os.path.splitext(self.base)
|
||||||
self.split_size = 0
|
self.base = base.replace('%', '%%')+'_split_%d'+ext
|
||||||
|
|
||||||
# Split on page breaks
|
self.trees = [self.item.data]
|
||||||
self.splitting_on_page_breaks = True
|
self.splitting_on_page_breaks = True
|
||||||
if not opts.dont_split_on_page_breaks:
|
if self.page_breaks:
|
||||||
self.log_info('\tSplitting on page breaks...')
|
self.split_on_page_breaks(self.item.data)
|
||||||
if self.path in stylesheet_map:
|
|
||||||
self.find_page_breaks(stylesheet_map[self.path], root)
|
|
||||||
self.split_on_page_breaks(root.getroottree())
|
|
||||||
trees = list(self.trees)
|
|
||||||
else:
|
|
||||||
self.trees = [root.getroottree()]
|
|
||||||
trees = list(self.trees)
|
|
||||||
|
|
||||||
# Split any remaining over-sized trees
|
|
||||||
self.splitting_on_page_breaks = False
|
self.splitting_on_page_breaks = False
|
||||||
if self.opts.profile.flow_size < sys.maxint:
|
|
||||||
|
if self.max_flow_size > 0:
|
||||||
lt_found = False
|
lt_found = False
|
||||||
self.log_info('\tLooking for large trees...')
|
self.log('\tLooking for large trees...')
|
||||||
for i, tree in enumerate(list(trees)):
|
trees = list(self.trees)
|
||||||
|
for i, tree in enumerate(list(self.trees)):
|
||||||
self.trees = []
|
self.trees = []
|
||||||
size = len(tostring(tree.getroot()))
|
size = len(tostring(tree.getroot()))
|
||||||
if size > self.opts.profile.flow_size:
|
if size > self.opts.profile.flow_size:
|
||||||
lt_found = True
|
lt_found = True
|
||||||
try:
|
self.split_to_size(tree)
|
||||||
self.split_to_size(tree)
|
|
||||||
except (SplitError, RuntimeError): # Splitting fails
|
|
||||||
if not self.always_remove:
|
|
||||||
self.always_remove = True
|
|
||||||
self.split_to_size(tree)
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
trees[i:i+1] = list(self.trees)
|
trees[i:i+1] = list(self.trees)
|
||||||
if not lt_found:
|
if not lt_found:
|
||||||
self.log_info('\tNo large trees found')
|
self.log_info('\tNo large trees found')
|
||||||
|
self.trees = trees
|
||||||
|
|
||||||
self.trees = trees
|
|
||||||
self.was_split = len(self.trees) > 1
|
self.was_split = len(self.trees) > 1
|
||||||
if self.was_split:
|
self.commit()
|
||||||
self.commit()
|
|
||||||
self.log_info('\t\tSplit into %d parts.', len(self.trees))
|
|
||||||
if self.opts.verbose:
|
|
||||||
for f in self.files:
|
|
||||||
self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
|
|
||||||
self.fix_opf(opf)
|
|
||||||
|
|
||||||
self.trees = None
|
def split_on_page_breaks(self, orig_tree):
|
||||||
|
ordered_ids = []
|
||||||
|
for elem in orig_tree.xpath('//*[@id]'):
|
||||||
|
id = elem.get('id')
|
||||||
|
if id in self.page_break_ids:
|
||||||
|
ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
|
||||||
|
|
||||||
|
self.trees = []
|
||||||
def split_text(self, text, root, size):
|
tree = orig_tree
|
||||||
self.log_debug('\t\t\tSplitting text of length: %d'%len(text))
|
for pattern, before in ordered_ids:
|
||||||
rest = text.replace('\r', '')
|
self.log.debug('\t\tSplitting on page-break')
|
||||||
parts = re.split('\n\n', rest)
|
elem = pattern(tree)
|
||||||
self.log_debug('\t\t\t\tFound %d parts'%len(parts))
|
if elem:
|
||||||
if max(map(len, parts)) > size:
|
before, after = self.do_split(tree, elem[0], before)
|
||||||
raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
|
self.trees.append(before)
|
||||||
ans = []
|
tree = after
|
||||||
buf = ''
|
self.trees.append(tree)
|
||||||
for part in parts:
|
self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
|
||||||
if len(buf) + len(part) < size:
|
|
||||||
buf += '\n\n'+part
|
|
||||||
else:
|
|
||||||
ans.append(buf)
|
|
||||||
buf = part
|
|
||||||
return ans
|
|
||||||
|
|
||||||
|
|
||||||
def split_to_size(self, tree):
|
|
||||||
self.log_debug('\t\tSplitting...')
|
|
||||||
root = tree.getroot()
|
|
||||||
# Split large <pre> tags
|
|
||||||
for pre in list(root.xpath('//pre')):
|
|
||||||
text = u''.join(pre.xpath('descendant::text()'))
|
|
||||||
pre.text = text
|
|
||||||
for child in list(pre.iterchildren()):
|
|
||||||
pre.remove(child)
|
|
||||||
if len(pre.text) > self.opts.profile.flow_size*0.5:
|
|
||||||
frags = self.split_text(pre.text, root, int(0.2*self.opts.profile.flow_size))
|
|
||||||
new_pres = []
|
|
||||||
for frag in frags:
|
|
||||||
pre2 = copy.copy(pre)
|
|
||||||
pre2.text = frag
|
|
||||||
pre2.tail = u''
|
|
||||||
new_pres.append(pre2)
|
|
||||||
new_pres[-1].tail = pre.tail
|
|
||||||
p = pre.getparent()
|
|
||||||
i = p.index(pre)
|
|
||||||
p[i:i+1] = new_pres
|
|
||||||
|
|
||||||
split_point, before = self.find_split_point(root)
|
|
||||||
if split_point is None or self.split_size > 6*self.orig_size:
|
|
||||||
if not self.always_remove:
|
|
||||||
self.log_warn(_('\t\tToo much markup. Re-splitting without '
|
|
||||||
'structure preservation. This may cause '
|
|
||||||
'incorrect rendering.'))
|
|
||||||
raise SplitError(self.path, root)
|
|
||||||
|
|
||||||
for t in self.do_split(tree, split_point, before):
|
|
||||||
r = t.getroot()
|
|
||||||
if self.is_page_empty(r):
|
|
||||||
continue
|
|
||||||
size = len(tostring(r))
|
|
||||||
if size <= self.opts.profile.flow_size:
|
|
||||||
self.trees.append(t)
|
|
||||||
#print tostring(t.getroot(), pretty_print=True)
|
|
||||||
self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
|
|
||||||
len(self.trees), size/1024.)
|
|
||||||
self.split_size += size
|
|
||||||
else:
|
|
||||||
self.split_to_size(t)
|
|
||||||
|
|
||||||
def is_page_empty(self, root):
|
|
||||||
body = root.find('body')
|
|
||||||
if body is None:
|
|
||||||
return False
|
|
||||||
txt = re.sub(r'\s+', '', html.tostring(body, method='text', encoding=unicode))
|
|
||||||
if len(txt) > 4:
|
|
||||||
#if len(txt) < 100:
|
|
||||||
# print 1111111, html.tostring(body, method='html', encoding=unicode)
|
|
||||||
return False
|
|
||||||
for img in root.xpath('//img'):
|
|
||||||
if img.get('style', '') != 'display:none':
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
def do_split(self, tree, split_point, before):
|
def do_split(self, tree, split_point, before):
|
||||||
'''
|
'''
|
||||||
@ -190,7 +211,7 @@ class Splitter(object):
|
|||||||
split_point2 = root2.xpath(path)[0]
|
split_point2 = root2.xpath(path)[0]
|
||||||
|
|
||||||
def nix_element(elem, top=True):
|
def nix_element(elem, top=True):
|
||||||
if self.always_remove:
|
if True:
|
||||||
parent = elem.getparent()
|
parent = elem.getparent()
|
||||||
index = parent.index(elem)
|
index = parent.index(elem)
|
||||||
if top:
|
if top:
|
||||||
@ -198,7 +219,6 @@ class Splitter(object):
|
|||||||
else:
|
else:
|
||||||
index = parent.index(elem)
|
index = parent.index(elem)
|
||||||
parent[index:index+1] = list(elem.iterchildren())
|
parent[index:index+1] = list(elem.iterchildren())
|
||||||
|
|
||||||
else:
|
else:
|
||||||
elem.text = u''
|
elem.text = u''
|
||||||
elem.tail = u''
|
elem.tail = u''
|
||||||
@ -241,67 +261,76 @@ class Splitter(object):
|
|||||||
|
|
||||||
return tree, tree2
|
return tree, tree2
|
||||||
|
|
||||||
|
def is_page_empty(self, root):
|
||||||
|
body = root.find('body')
|
||||||
|
if body is None:
|
||||||
|
return False
|
||||||
|
txt = re.sub(r'\s+', '', html.tostring(body, method='text', encoding=unicode))
|
||||||
|
if len(txt) > 4:
|
||||||
|
return False
|
||||||
|
for img in root.xpath('//img'):
|
||||||
|
if img.get('style', '') != 'display:none':
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
def split_on_page_breaks(self, orig_tree):
|
def split_text(self, text, root, size):
|
||||||
ordered_ids = []
|
self.log.debug('\t\t\tSplitting text of length: %d'%len(text))
|
||||||
for elem in orig_tree.xpath('//*[@id]'):
|
rest = text.replace('\r', '')
|
||||||
id = elem.get('id')
|
parts = re.split('\n\n', rest)
|
||||||
if id in self.page_break_ids:
|
self.log.debug('\t\t\t\tFound %d parts'%len(parts))
|
||||||
ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
|
if max(map(len, parts)) > size:
|
||||||
|
raise SplitError('Cannot split as file contains a <pre> tag '
|
||||||
self.trees = []
|
'with a very large paragraph', root)
|
||||||
tree = orig_tree
|
ans = []
|
||||||
for pattern, before in ordered_ids:
|
buf = ''
|
||||||
self.log_info('\t\tSplitting on page-break')
|
for part in parts:
|
||||||
elem = pattern(tree)
|
if len(buf) + len(part) < size:
|
||||||
if elem:
|
buf += '\n\n'+part
|
||||||
before, after = self.do_split(tree, elem[0], before)
|
else:
|
||||||
self.trees.append(before)
|
ans.append(buf)
|
||||||
tree = after
|
buf = part
|
||||||
self.trees.append(tree)
|
return ans
|
||||||
self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
|
|
||||||
|
|
||||||
|
|
||||||
|
def split_to_size(self, tree):
|
||||||
|
self.log.debug('\t\tSplitting...')
|
||||||
|
root = tree.getroot()
|
||||||
|
# Split large <pre> tags
|
||||||
|
for pre in list(root.xpath('//pre')):
|
||||||
|
text = u''.join(pre.xpath('descendant::text()'))
|
||||||
|
pre.text = text
|
||||||
|
for child in list(pre.iterchildren()):
|
||||||
|
pre.remove(child)
|
||||||
|
if len(pre.text) > self.max_flow_size*0.5:
|
||||||
|
frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
|
||||||
|
new_pres = []
|
||||||
|
for frag in frags:
|
||||||
|
pre2 = copy.copy(pre)
|
||||||
|
pre2.text = frag
|
||||||
|
pre2.tail = u''
|
||||||
|
new_pres.append(pre2)
|
||||||
|
new_pres[-1].tail = pre.tail
|
||||||
|
p = pre.getparent()
|
||||||
|
i = p.index(pre)
|
||||||
|
p[i:i+1] = new_pres
|
||||||
|
|
||||||
def find_page_breaks(self, stylesheets, root):
|
split_point, before = self.find_split_point(root)
|
||||||
'''
|
if split_point is None:
|
||||||
Find all elements that have either page-break-before or page-break-after set.
|
raise SplitError(self.item.href, root)
|
||||||
Populates `self.page_breaks` with id based XPath selectors (for elements that don't
|
|
||||||
have ids, an id is created).
|
|
||||||
'''
|
|
||||||
page_break_selectors = set([])
|
|
||||||
for rule in rules(stylesheets):
|
|
||||||
before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
|
|
||||||
after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
|
|
||||||
try:
|
|
||||||
if before and before != 'avoid':
|
|
||||||
page_break_selectors.add((CSSSelector(rule.selectorText), True))
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
try:
|
|
||||||
if after and after != 'avoid':
|
|
||||||
page_break_selectors.add((CSSSelector(rule.selectorText), False))
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
page_breaks = set([])
|
|
||||||
for selector, before in page_break_selectors:
|
|
||||||
for elem in selector(root):
|
|
||||||
elem.pb_before = before
|
|
||||||
page_breaks.add(elem)
|
|
||||||
|
|
||||||
for i, elem in enumerate(root.iter()):
|
|
||||||
elem.pb_order = i
|
|
||||||
|
|
||||||
page_breaks = list(page_breaks)
|
|
||||||
page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
|
|
||||||
self.page_break_ids = []
|
|
||||||
for i, x in enumerate(page_breaks):
|
|
||||||
x.set('id', x.get('id', 'calibre_pb_%d'%i))
|
|
||||||
id = x.get('id')
|
|
||||||
self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
|
|
||||||
self.page_break_ids.append(id)
|
|
||||||
|
|
||||||
|
for t in self.do_split(tree, split_point, before):
|
||||||
|
r = t.getroot()
|
||||||
|
if self.is_page_empty(r):
|
||||||
|
continue
|
||||||
|
size = len(tostring(r))
|
||||||
|
if size <= self.max_flow_size:
|
||||||
|
self.trees.append(t)
|
||||||
|
#print tostring(t.getroot(), pretty_print=True)
|
||||||
|
self.log.debug('\t\t\tCommitted sub-tree #%d (%d KB)',
|
||||||
|
len(self.trees), size/1024.)
|
||||||
|
self.split_size += size
|
||||||
|
else:
|
||||||
|
self.split_to_size(t)
|
||||||
|
|
||||||
def find_split_point(self, root):
|
def find_split_point(self, root):
|
||||||
'''
|
'''
|
||||||
@ -336,8 +365,7 @@ class Splitter(object):
|
|||||||
'//br',
|
'//br',
|
||||||
'//li',
|
'//li',
|
||||||
):
|
):
|
||||||
elems = root.xpath(path,
|
elems = root.xpath(path, namespaces=NAMESPACES)
|
||||||
namespaces={'re':'http://exslt.org/regular-expressions'})
|
|
||||||
elem = pick_elem(elems)
|
elem = pick_elem(elems)
|
||||||
if elem is not None:
|
if elem is not None:
|
||||||
try:
|
try:
|
||||||
@ -355,6 +383,8 @@ class Splitter(object):
|
|||||||
all anchors in the original tree. Internal links are re-directed. The
|
all anchors in the original tree. Internal links are re-directed. The
|
||||||
original file is deleted and the split files are saved.
|
original file is deleted and the split files are saved.
|
||||||
'''
|
'''
|
||||||
|
if not self.was_split:
|
||||||
|
return
|
||||||
self.anchor_map = collections.defaultdict(lambda :self.base%0)
|
self.anchor_map = collections.defaultdict(lambda :self.base%0)
|
||||||
self.files = []
|
self.files = []
|
||||||
|
|
||||||
@ -368,134 +398,46 @@ class Splitter(object):
|
|||||||
elem.attrib.pop(SPLIT_ATTR, None)
|
elem.attrib.pop(SPLIT_ATTR, None)
|
||||||
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
|
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
|
||||||
|
|
||||||
for current, tree in zip(self.files, self.trees):
|
spine_pos = self.item.spine_pos
|
||||||
for a in tree.getroot().xpath('//a[@href]'):
|
for current, tree in zip(map(reversed, (self.files, self.trees))):
|
||||||
|
for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
|
||||||
href = a.get('href').strip()
|
href = a.get('href').strip()
|
||||||
if href.startswith('#'):
|
if href.startswith('#'):
|
||||||
anchor = href[1:]
|
anchor = href[1:]
|
||||||
file = self.anchor_map[anchor]
|
file = self.anchor_map[anchor]
|
||||||
if file != current:
|
if file != current:
|
||||||
a.set('href', file+href)
|
a.set('href', file+href)
|
||||||
open(content(current), 'wb').\
|
|
||||||
write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print))
|
|
||||||
|
|
||||||
os.remove(content(self.path))
|
new_id = self.oeb.manifest.generate(id=self.item.id)[0]
|
||||||
|
new_item = self.oeb.manifest.add(new_id, current,
|
||||||
|
self.item.media_type, data=tree.getroot())
|
||||||
|
self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
|
||||||
|
|
||||||
|
if self.oeb.guide:
|
||||||
|
for ref in self.oeb.guide:
|
||||||
|
href, frag = urldefrag(ref.href)
|
||||||
|
if href == self.item.href:
|
||||||
|
nhref = self.anchor_map[frag if frag else None]
|
||||||
|
if frag:
|
||||||
|
nhref = '#'.join(nhref, frag)
|
||||||
|
ref.href = nhref
|
||||||
|
|
||||||
|
def fix_toc_entry(toc):
|
||||||
|
if toc.href:
|
||||||
|
href, frag = urldefrag(toc.href)
|
||||||
|
if href == self.item.href:
|
||||||
|
nhref = self.anchor_map[frag if frag else None]
|
||||||
|
if frag:
|
||||||
|
nhref = '#'.join(nhref, frag)
|
||||||
|
toc.href = nhref
|
||||||
|
for x in toc:
|
||||||
|
fix_toc_entry(x)
|
||||||
|
|
||||||
|
|
||||||
def fix_opf(self, opf):
|
if self.oeb.toc:
|
||||||
'''
|
fix_toc_entry(self.oeb.toc)
|
||||||
Fix references to the split file in the OPF.
|
|
||||||
'''
|
|
||||||
items = [item for item in opf.itermanifest() if item.get('href') == 'content/'+self.path]
|
|
||||||
new_items = [('content/'+f, None) for f in self.files]
|
|
||||||
id_map = {}
|
|
||||||
for item in items:
|
|
||||||
id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)
|
|
||||||
|
|
||||||
for id in id_map.keys():
|
self.oeb.manifest.remove(self.item)
|
||||||
opf.replace_spine_items_by_idref(id, id_map[id])
|
|
||||||
|
|
||||||
for ref in opf.iterguide():
|
|
||||||
href = ref.get('href', '')
|
|
||||||
if href.startswith('content/'+self.path):
|
|
||||||
href = href.split('#')
|
|
||||||
frag = None
|
|
||||||
if len(href) > 1:
|
|
||||||
frag = href[1]
|
|
||||||
if frag not in self.anchor_map:
|
|
||||||
self.log_warning('\t\tUnable to re-map OPF link', href)
|
|
||||||
continue
|
|
||||||
new_file = self.anchor_map[frag]
|
|
||||||
ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def fix_content_links(html_files, changes, opts):
|
|
||||||
split_files = [f.path for f in changes]
|
|
||||||
anchor_maps = [f.anchor_map for f in changes]
|
|
||||||
files = list(html_files)
|
|
||||||
for j, f in enumerate(split_files):
|
|
||||||
try:
|
|
||||||
i = files.index(f)
|
|
||||||
files[i:i+1] = changes[j].files
|
|
||||||
except ValueError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
for htmlfile in files:
|
|
||||||
changed = False
|
|
||||||
root = html.fromstring(open(content(htmlfile), 'rb').read())
|
|
||||||
for a in root.xpath('//a[@href]'):
|
|
||||||
href = a.get('href')
|
|
||||||
if not href.startswith('#'):
|
|
||||||
href = href.split('#')
|
|
||||||
anchor = href[1] if len(href) > 1 else None
|
|
||||||
href = href[0]
|
|
||||||
if href in split_files:
|
|
||||||
try:
|
|
||||||
newf = anchor_maps[split_files.index(href)][anchor]
|
|
||||||
except:
|
|
||||||
print '\t\tUnable to remap HTML link:', href, anchor
|
|
||||||
continue
|
|
||||||
frag = ('#'+anchor) if anchor else ''
|
|
||||||
a.set('href', newf+frag)
|
|
||||||
changed = True
|
|
||||||
|
|
||||||
if changed:
|
|
||||||
open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
|
|
||||||
|
|
||||||
def fix_ncx(path, changes):
|
|
||||||
split_files = [f.path for f in changes]
|
|
||||||
anchor_maps = [f.anchor_map for f in changes]
|
|
||||||
tree = etree.parse(path)
|
|
||||||
changed = False
|
|
||||||
for content in tree.getroot().xpath('//x:content[@src]',
|
|
||||||
namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
|
|
||||||
href = content.get('src')
|
|
||||||
if not href.startswith('#'):
|
|
||||||
href = href.split('#')
|
|
||||||
anchor = href[1] if len(href) > 1 else None
|
|
||||||
href = href[0].split('/')[-1]
|
|
||||||
if href in split_files:
|
|
||||||
try:
|
|
||||||
newf = anchor_maps[split_files.index(href)][anchor]
|
|
||||||
except:
|
|
||||||
print 'Unable to remap NCX link:', href, anchor
|
|
||||||
frag = ('#'+anchor) if anchor else ''
|
|
||||||
content.set('src', 'content/'+newf+frag)
|
|
||||||
changed = True
|
|
||||||
if changed:
|
|
||||||
open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True))
|
|
||||||
|
|
||||||
def find_html_files(opf):
|
|
||||||
'''
|
|
||||||
Find all HTML files referenced by `opf`.
|
|
||||||
'''
|
|
||||||
html_files = []
|
|
||||||
for item in opf.itermanifest():
|
|
||||||
if 'html' in item.get('media-type', '').lower():
|
|
||||||
f = item.get('href').split('/')[-1]
|
|
||||||
f2 = f.replace('&', '%26')
|
|
||||||
if not os.path.exists(content(f)) and os.path.exists(content(f2)):
|
|
||||||
f = f2
|
|
||||||
item.set('href', item.get('href').replace('&', '%26'))
|
|
||||||
if os.path.exists(content(f)):
|
|
||||||
html_files.append(f)
|
|
||||||
return html_files
|
|
||||||
|
|
||||||
|
|
||||||
def split(pathtoopf, opts, stylesheet_map):
|
|
||||||
pathtoopf = os.path.abspath(pathtoopf)
|
|
||||||
opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
|
|
||||||
|
|
||||||
with CurrentDir(os.path.dirname(pathtoopf)):
|
|
||||||
html_files = find_html_files(opf)
|
|
||||||
changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files]
|
|
||||||
changes = [c for c in changes if c.was_split]
|
|
||||||
|
|
||||||
fix_content_links(html_files, changes, opts)
|
|
||||||
for item in opf.itermanifest():
|
|
||||||
if item.get('media-type', '') == 'application/x-dtbncx+xml':
|
|
||||||
fix_ncx(item.get('href'), changes)
|
|
||||||
break
|
|
||||||
|
|
||||||
open(pathtoopf, 'wb').write(opf.render())
|
|
Loading…
x
Reference in New Issue
Block a user