Mirror of https://github.com/kovidgoyal/calibre.git
EPUB Output: Change the splitting logic. Now calibre will first split on *all* page breaks and then split any remaining large chunks. This *may* have caused regressions, so be careful.
commit f152f37ec0
parent c9e86cc1e9
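The message above describes the new order of operations: each flow is first split at every explicit page break, and only then are any still-oversized chunks re-split to fit the profile's flow-size limit. Below is a minimal, hypothetical sketch of that two-stage ordering in plain Python 3. It operates on a list of text blocks with a page-break sentinel rather than on lxml trees, and the names (split_flow, split_on_page_breaks, split_to_size, PAGE_BREAK, max_size) are illustrative only, not calibre APIs.

# Hypothetical illustration of the two-stage split order described in the
# commit message; not calibre code. A "flow" is modelled as a list of text
# blocks, with a sentinel object marking explicit page breaks.

PAGE_BREAK = object()  # stands in for an element carrying a page-break rule

def split_on_page_breaks(blocks):
    """Stage 1: split at *every* page break, regardless of chunk size."""
    chunks, current = [], []
    for block in blocks:
        if block is PAGE_BREAK:
            if current:
                chunks.append(current)
            current = []
        else:
            current.append(block)
    if current:
        chunks.append(current)
    return chunks

def split_to_size(chunk, max_size):
    """Stage 2: recursively halve any chunk that is still too large."""
    if sum(len(b) for b in chunk) <= max_size or len(chunk) < 2:
        return [chunk]
    mid = len(chunk) // 2
    return split_to_size(chunk[:mid], max_size) + split_to_size(chunk[mid:], max_size)

def split_flow(blocks, max_size):
    parts = []
    for chunk in split_on_page_breaks(blocks):        # page breaks first
        parts.extend(split_to_size(chunk, max_size))  # then size limits
    return parts

if __name__ == '__main__':
    flow = ['a' * 300, PAGE_BREAK, 'b' * 50, 'c' * 400, PAGE_BREAK, 'd' * 10]
    for i, part in enumerate(split_flow(flow, max_size=250)):
        print(i, [len(b) for b in part])

In the actual diff below, stage 2 only runs when the output profile imposes a size limit (the opts.profile.flow_size < sys.maxint guard in __init__), and a chunk whose markup is too dense to split cleanly falls back to re-splitting without structure preservation.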
@@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
 Split the flows in an epub file to conform to size limitations.
 '''

-import os, math, logging, functools, collections, re, copy
+import os, math, logging, functools, collections, re, copy, sys

 from lxml.etree import XPath as _XPath
 from lxml import etree, html
@@ -34,29 +34,61 @@ class SplitError(ValueError):

 class Splitter(LoggingInterface):

-    def __init__(self, path, opts, stylesheet_map, always_remove=False):
+    def __init__(self, path, opts, stylesheet_map, opf):
         LoggingInterface.__init__(self, logging.getLogger('htmlsplit'))
         self.setup_cli_handler(opts.verbose)
         self.path = path
-        self.always_remove = always_remove
+        self.always_remove = not opts.preserve_tag_structure or \
+                os.stat(content(path)).st_size > 5*opts.profile.flow_size
         self.base = (os.path.splitext(path)[0].replace('%', '%%') + '_split_%d.html')
         self.opts = opts
         self.orig_size = os.stat(content(path)).st_size
         self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
         root = html.fromstring(open(content(path)).read())

-        self.page_breaks = []
-        self.find_page_breaks(stylesheet_map[self.path], root)
-
-        self.trees = []
+        self.page_breaks, self.trees = [], []
         self.split_size = 0
-        self.split(root.getroottree())
-        self.commit()
-        self.log_info('\t\tSplit into %d parts.', len(self.trees))
-        if self.opts.verbose:
-            for f in self.files:
-                self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
+
+        # Split on page breaks
+        self.log_info('\tSplitting on page breaks...')
+        if self.path in stylesheet_map:
+            self.find_page_breaks(stylesheet_map[self.path], root)
+        self.split_on_page_breaks(root.getroottree())
+        trees = list(self.trees)
+
+        # Split any remaining over-sized trees
+        if self.opts.profile.flow_size < sys.maxint:
+            lt_found = False
+            self.log_info('\tLooking for large trees...')
+            for i, tree in enumerate(list(trees)):
+                self.trees = []
+                size = len(tostring(tree.getroot()))
+                if size > self.opts.profile.flow_size:
+                    lt_found = True
+                    try:
+                        self.split_to_size(tree)
+                    except (SplitError, RuntimeError): # Splitting fails
+                        if not self.always_remove:
+                            self.always_remove = True
+                            self.split_to_size(tree)
+                        else:
+                            raise
+                    trees[i:i+1] = list(self.trees)
+            if not lt_found:
+                self.log_info('\tNo large trees found')
+
+        self.trees = trees
+        self.was_split = len(self.trees) > 1
+        if self.was_split:
+            self.commit()
+            self.log_info('\t\tSplit into %d parts.', len(self.trees))
+            if self.opts.verbose:
+                for f in self.files:
+                    self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
+            self.fix_opf(opf)
+
         self.trees = None


     def split_text(self, text, root, size):
         self.log_debug('\t\t\tSplitting text of length: %d'%len(text))
@@ -76,12 +108,7 @@ class Splitter(LoggingInterface):
         return ans


-    def split(self, tree):
-        '''
-        Split ``tree`` into a *before* and *after* tree, preserving tag structure,
-        but not duplicating any text. All tags that have had their text and tail
-        removed have the attribute ``calibre_split`` set to 1.
-        '''
+    def split_to_size(self, tree):
         self.log_debug('\t\tSplitting...')
         root = tree.getroot()
         # Split large <pre> tags
@@ -108,10 +135,48 @@ class Splitter(LoggingInterface):
             if not self.always_remove:
                 self.log_warn(_('\t\tToo much markup. Re-splitting without structure preservation. This may cause incorrect rendering.'))
             raise SplitError(self.path, root)
-        tree2 = copy.deepcopy(tree)
-        root2 = tree2.getroot()
-        body, body2 = root.body, root2.body
-        path = tree.getpath(split_point)
+        for t in self.do_split(tree, split_point, before):
+            r = t.getroot()
+            size = len(tostring(r))
+            if size <= self.opts.profile.flow_size:
+                self.trees.append(t)
+                #print tostring(t.getroot(), pretty_print=True)
+                self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)', len(self.trees), size/1024.)
+                self.split_size += size
+            else:
+                self.split_to_size(t)
+
+    def is_page_empty(self, root):
+        body = root.find('body')
+        if body is None:
+            return False
+        txt = re.sub(r'\s+', '', html.tostring(body, method='text', encoding=unicode))
+        if len(txt) > 4:
+            #if len(txt) < 100:
+            #    print 1111111, html.tostring(body, method='html', encoding=unicode)
+            return False
+        for img in root.xpath('//img'):
+            if img.get('style', '') != 'display:none':
+                return False
+        return True
+
+    def do_split(self, tree, split_point, before):
+        '''
+        Split ``tree`` into a *before* and *after* tree at ``split_point``,
+        preserving tag structure, but not duplicating any text.
+        All tags that have had their text and tail
+        removed have the attribute ``calibre_split`` set to 1.
+
+        :param before: If True tree is split before split_point, otherwise after split_point
+        :return: before_tree, after_tree
+        '''
+        path = tree.getpath(split_point)
+        tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree)
+        root = tree.getroot()
+        root2 = tree2.getroot()
+        body, body2 = root.body, root2.body
+        split_point = root.xpath(path)[0]
         split_point2 = root2.xpath(path)[0]

         def nix_element(elem, top=True):
@@ -129,7 +194,7 @@ class Splitter(LoggingInterface):
                 elem.tail = u''
                 elem.set(SPLIT_ATTR, '1')
                 if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']:
-                    elem.set('style', 'display:none;')
+                    elem.set('style', 'display:none')

         def fix_split_point(sp):
             sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')
@@ -163,20 +228,35 @@ class Splitter(LoggingInterface):
             if not hit_split_point:
                 nix_element(elem, top=False)

-        for t, r in [(tree, root), (tree2, root2)]:
-            size = len(tostring(r))
-            if size <= self.opts.profile.flow_size:
-                self.trees.append(t)
-                #print tostring(t.getroot(), pretty_print=True)
-                self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)', len(self.trees), size/1024.)
-                self.split_size += size
-            else:
-                self.split(t)
+        return tree, tree2
+
+
+    def split_on_page_breaks(self, orig_tree):
+        ordered_ids = []
+        for elem in orig_tree.xpath('//*[@id]'):
+            id = elem.get('id')
+            if id in self.page_break_ids:
+                ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
+
+        self.trees = []
+        tree = orig_tree
+        for pattern, before in ordered_ids:
+            self.log_info('\t\tSplitting on page-break')
+            elem = pattern(tree)
+            if elem:
+                before, after = self.do_split(tree, elem[0], before)
+                self.trees.append(before)
+                tree = after
+        self.trees.append(tree)
+        self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
+


     def find_page_breaks(self, stylesheets, root):
         '''
         Find all elements that have either page-break-before or page-break-after set.
+        Populates `self.page_breaks` with id based XPath selectors (for elements that don't
+        have ids, an id is created).
         '''
         page_break_selectors = set([])
         for rule in rules(stylesheets):
@@ -204,16 +284,18 @@ class Splitter(LoggingInterface):

         page_breaks = list(page_breaks)
         page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
+        self.page_break_ids = []
         for i, x in enumerate(page_breaks):
             x.set('id', x.get('id', 'calibre_pb_%d'%i))
-            self.page_breaks.append((XPath('//*[@id="%s"]'%x.get('id')), x.pb_before))
+            id = x.get('id')
+            self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
+            self.page_break_ids.append(id)


     def find_split_point(self, root):
         '''
         Find the tag at which to split the tree rooted at `root`.
         Search order is:
-            * page breaks
             * Heading tags
             * <div> tags
             * <p> tags
@@ -229,19 +311,6 @@ class Splitter(LoggingInterface):
                    elems[i].set(SPLIT_POINT_ATTR, '1')
                    return elems[i]

-        page_breaks = []
-        for x in self.page_breaks:
-            pb = x[0](root)
-            if pb:
-                page_breaks.append(pb[0])
-
-        elem = pick_elem(page_breaks)
-        if elem is not None:
-            i = page_breaks.index(elem)
-            return elem, self.page_breaks[i][1]
-
-
         for path in (
                  '//*[re:match(name(), "h[1-6]", "i")]',
                  '/html/body/div',
@@ -354,7 +423,8 @@ def fix_ncx(path, changes):
     anchor_maps = [f.anchor_map for f in changes]
     tree = etree.parse(path)
     changed = False
-    for content in tree.getroot().xpath('//x:content[@src]', namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
+    for content in tree.getroot().xpath('//x:content[@src]',
+            namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
         href = content.get('src')
         if not href.startswith('#'):
             href = href.split('#')
@@ -367,39 +437,37 @@ def fix_ncx(path, changes):
             changed = True
     if changed:
         open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True))

+def find_html_files(opf):
+    '''
+    Find all HTML files referenced by `opf`.
+    '''
+    html_files = []
+    for item in opf.itermanifest():
+        if 'html' in item.get('media-type', '').lower():
+            f = item.get('href').split('/')[-1]
+            f2 = f.replace('&', '%26')
+            if not os.path.exists(content(f)) and os.path.exists(content(f2)):
+                f = f2
+                item.set('href', item.get('href').replace('&', '%26'))
+            if os.path.exists(content(f)):
+                html_files.append(f)
+    return html_files
+
+
 def split(pathtoopf, opts, stylesheet_map):
     pathtoopf = os.path.abspath(pathtoopf)
+    opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
+
     with CurrentDir(os.path.dirname(pathtoopf)):
-        opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
-        html_files = []
-        for item in opf.itermanifest():
-            if 'html' in item.get('media-type', '').lower():
-                f = item.get('href').split('/')[-1]
-                f2 = f.replace('&', '%26')
-                if not os.path.exists(content(f)) and os.path.exists(content(f2)):
-                    f = f2
-                    item.set('href', item.get('href').replace('&', '%26'))
-                html_files.append(f)
-        changes = []
-        always_remove = not opts.preserve_tag_structure
-        for f in html_files:
-            if os.stat(content(f)).st_size > opts.profile.flow_size:
-                try:
-                    changes.append(Splitter(f, opts, stylesheet_map,
-                        always_remove=(always_remove or \
-                            os.stat(content(f)).st_size > 5*opts.profile.flow_size)))
-                except (SplitError, RuntimeError):
-                    if not always_remove:
-                        changes.append(Splitter(f, opts, stylesheet_map, always_remove=True))
-                    else:
-                        raise
-                changes[-1].fix_opf(opf)
-
-        open(pathtoopf, 'wb').write(opf.render())
+        html_files = find_html_files(opf)
+        changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files]
+        changes = [c for c in changes if c.was_split]
+
         fix_content_links(html_files, changes, opts)
         for item in opf.itermanifest():
             if item.get('media-type', '') == 'application/x-dtbncx+xml':
                 fix_ncx(item.get('href'), changes)
                 break
+
+    open(pathtoopf, 'wb').write(opf.render())
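One detail worth calling out from the diff: find_page_breaks now records page-break elements by id (assigning calibre_pb_N ids where needed) and keeps //*[@id=...] XPath selectors plus a parallel page_break_ids list, so split_on_page_breaks can re-locate each break inside the deep-copied "after" tree produced by the previous split. Below is a small, hypothetical lxml sketch of that id-based bookkeeping; the document markup and the use of h1 as the page-break element are made up for illustration and are not part of the commit.

# Hypothetical sketch, not calibre code: remember a split point by id so an
# XPath selector can find it again in an independent deep copy of the tree.
import copy
from lxml import etree, html

doc = html.fromstring(
    '<html><body><p>one</p><h1 id="ch2">two</h1><p>three</p></body></html>')
tree = doc.getroottree()

# Bookkeeping: store an id-based selector for each (pretend) page-break element
page_breaks, page_break_ids = [], []
for i, elem in enumerate(tree.xpath('//h1')):   # pretend h1 carries page-break-before
    elem.set('id', elem.get('id', 'calibre_pb_%d' % i))
    pb_id = elem.get('id')
    page_breaks.append((etree.XPath('//*[@id="%s"]' % pb_id), True))
    page_break_ids.append(pb_id)

# The selector still finds the element in an independent copy of the tree
copied = copy.deepcopy(tree)
selector, split_before = page_breaks[0]
print(selector(copied)[0].get('id'), split_before)   # -> ch2 True

Looking elements up by id rather than holding direct references is what lets the selectors survive copy.deepcopy, which do_split relies on when it clones the tree into its before and after halves.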