mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #2358 (HTML to ePub conversion results in duplicated content / incorrect breaks)
This commit is contained in:
parent
6f072dc3d1
commit
e869684a29
@ -24,16 +24,16 @@ SPLIT_ATTR = 'cs'
|
|||||||
SPLIT_POINT_ATTR = 'csp'
|
SPLIT_POINT_ATTR = 'csp'
|
||||||
|
|
||||||
class SplitError(ValueError):
|
class SplitError(ValueError):
|
||||||
|
|
||||||
def __init__(self, path, root):
|
def __init__(self, path, root):
|
||||||
size = len(tostring(root))/1024.
|
size = len(tostring(root))/1024.
|
||||||
ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')%
|
ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')%
|
||||||
(os.path.basename(path), size))
|
(os.path.basename(path), size))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Splitter(LoggingInterface):
|
class Splitter(LoggingInterface):
|
||||||
|
|
||||||
def __init__(self, path, opts, stylesheet_map, opf):
|
def __init__(self, path, opts, stylesheet_map, opf):
|
||||||
LoggingInterface.__init__(self, logging.getLogger('htmlsplit'))
|
LoggingInterface.__init__(self, logging.getLogger('htmlsplit'))
|
||||||
self.setup_cli_handler(opts.verbose)
|
self.setup_cli_handler(opts.verbose)
|
||||||
@ -45,10 +45,10 @@ class Splitter(LoggingInterface):
|
|||||||
self.orig_size = os.stat(content(path)).st_size
|
self.orig_size = os.stat(content(path)).st_size
|
||||||
self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
|
self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
|
||||||
root = html.fromstring(open(content(path)).read())
|
root = html.fromstring(open(content(path)).read())
|
||||||
|
|
||||||
self.page_breaks, self.trees = [], []
|
self.page_breaks, self.trees = [], []
|
||||||
self.split_size = 0
|
self.split_size = 0
|
||||||
|
|
||||||
# Split on page breaks
|
# Split on page breaks
|
||||||
self.splitting_on_page_breaks = True
|
self.splitting_on_page_breaks = True
|
||||||
if not opts.dont_split_on_page_breaks:
|
if not opts.dont_split_on_page_breaks:
|
||||||
@ -60,29 +60,36 @@ class Splitter(LoggingInterface):
|
|||||||
else:
|
else:
|
||||||
self.trees = [root.getroottree()]
|
self.trees = [root.getroottree()]
|
||||||
trees = list(self.trees)
|
trees = list(self.trees)
|
||||||
|
|
||||||
# Split any remaining over-sized trees
|
# Split any remaining over-sized trees
|
||||||
self.splitting_on_page_breaks = False
|
self.splitting_on_page_breaks = False
|
||||||
if self.opts.profile.flow_size < sys.maxint:
|
if self.opts.profile.flow_size < sys.maxint:
|
||||||
lt_found = False
|
lt_found = False
|
||||||
self.log_info('\tLooking for large trees...')
|
self.log_info('\tLooking for large trees...')
|
||||||
|
self.tree_map = {}
|
||||||
for i, tree in enumerate(list(trees)):
|
for i, tree in enumerate(list(trees)):
|
||||||
self.trees = []
|
self.split_trees = []
|
||||||
size = len(tostring(tree.getroot()))
|
size = len(tostring(tree.getroot()))
|
||||||
if size > self.opts.profile.flow_size:
|
if size > self.opts.profile.flow_size:
|
||||||
lt_found = True
|
lt_found = True
|
||||||
try:
|
try:
|
||||||
self.split_to_size(tree)
|
self.split_to_size(tree)
|
||||||
|
self.tree_map[tree] = self.split_trees
|
||||||
except (SplitError, RuntimeError): # Splitting fails
|
except (SplitError, RuntimeError): # Splitting fails
|
||||||
if not self.always_remove:
|
if not self.always_remove:
|
||||||
self.always_remove = True
|
self.always_remove = True
|
||||||
|
self.split_trees = []
|
||||||
self.split_to_size(tree)
|
self.split_to_size(tree)
|
||||||
|
self.tree_map[tree] = self.split_trees
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
trees[i:i+1] = list(self.trees)
|
t = []
|
||||||
|
for x in trees:
|
||||||
|
t.extend(self.tree_map.get(x, [x]))
|
||||||
|
trees = t
|
||||||
if not lt_found:
|
if not lt_found:
|
||||||
self.log_info('\tNo large trees found')
|
self.log_info('\tNo large trees found')
|
||||||
|
|
||||||
self.trees = trees
|
self.trees = trees
|
||||||
self.was_split = len(self.trees) > 1
|
self.was_split = len(self.trees) > 1
|
||||||
if self.was_split:
|
if self.was_split:
|
||||||
@ -92,17 +99,17 @@ class Splitter(LoggingInterface):
|
|||||||
for f in self.files:
|
for f in self.files:
|
||||||
self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
|
self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
|
||||||
self.fix_opf(opf)
|
self.fix_opf(opf)
|
||||||
|
|
||||||
self.trees = None
|
self.trees = None
|
||||||
|
|
||||||
|
|
||||||
def split_text(self, text, root, size):
|
def split_text(self, text, root, size):
|
||||||
self.log_debug('\t\t\tSplitting text of length: %d'%len(text))
|
self.log_debug('\t\t\tSplitting text of length: %d'%len(text))
|
||||||
rest = text.replace('\r', '')
|
rest = text.replace('\r', '')
|
||||||
parts = re.split('\n\n', rest)
|
parts = re.split('\n\n', rest)
|
||||||
self.log_debug('\t\t\t\tFound %d parts'%len(parts))
|
self.log_debug('\t\t\t\tFound %d parts'%len(parts))
|
||||||
if max(map(len, parts)) > size:
|
if max(map(len, parts)) > size:
|
||||||
raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
|
raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
|
||||||
ans = []
|
ans = []
|
||||||
buf = ''
|
buf = ''
|
||||||
for part in parts:
|
for part in parts:
|
||||||
@ -112,8 +119,8 @@ class Splitter(LoggingInterface):
|
|||||||
ans.append(buf)
|
ans.append(buf)
|
||||||
buf = part
|
buf = part
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
def split_to_size(self, tree):
|
def split_to_size(self, tree):
|
||||||
self.log_debug('\t\tSplitting...')
|
self.log_debug('\t\tSplitting...')
|
||||||
root = tree.getroot()
|
root = tree.getroot()
|
||||||
@ -135,7 +142,7 @@ class Splitter(LoggingInterface):
|
|||||||
p = pre.getparent()
|
p = pre.getparent()
|
||||||
i = p.index(pre)
|
i = p.index(pre)
|
||||||
p[i:i+1] = new_pres
|
p[i:i+1] = new_pres
|
||||||
|
|
||||||
split_point, before = self.find_split_point(root)
|
split_point, before = self.find_split_point(root)
|
||||||
if split_point is None or self.split_size > 6*self.orig_size:
|
if split_point is None or self.split_size > 6*self.orig_size:
|
||||||
if not self.always_remove:
|
if not self.always_remove:
|
||||||
@ -143,21 +150,21 @@ class Splitter(LoggingInterface):
|
|||||||
'structure preservation. This may cause '
|
'structure preservation. This may cause '
|
||||||
'incorrect rendering.'))
|
'incorrect rendering.'))
|
||||||
raise SplitError(self.path, root)
|
raise SplitError(self.path, root)
|
||||||
|
|
||||||
for t in self.do_split(tree, split_point, before):
|
for t in self.do_split(tree, split_point, before):
|
||||||
r = t.getroot()
|
r = t.getroot()
|
||||||
if self.is_page_empty(r):
|
if self.is_page_empty(r):
|
||||||
continue
|
continue
|
||||||
size = len(tostring(r))
|
size = len(tostring(r))
|
||||||
if size <= self.opts.profile.flow_size:
|
if size <= self.opts.profile.flow_size:
|
||||||
self.trees.append(t)
|
self.split_trees.append(t)
|
||||||
#print tostring(t.getroot(), pretty_print=True)
|
#print tostring(t.getroot(), pretty_print=True)
|
||||||
self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
|
self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
|
||||||
len(self.trees), size/1024.)
|
len(self.split_trees), size/1024.)
|
||||||
self.split_size += size
|
self.split_size += size
|
||||||
else:
|
else:
|
||||||
self.split_to_size(t)
|
self.split_to_size(t)
|
||||||
|
|
||||||
def is_page_empty(self, root):
|
def is_page_empty(self, root):
|
||||||
body = root.find('body')
|
body = root.find('body')
|
||||||
if body is None:
|
if body is None:
|
||||||
@ -171,14 +178,14 @@ class Splitter(LoggingInterface):
|
|||||||
if img.get('style', '') != 'display:none':
|
if img.get('style', '') != 'display:none':
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def do_split(self, tree, split_point, before):
|
def do_split(self, tree, split_point, before):
|
||||||
'''
|
'''
|
||||||
Split ``tree`` into a *before* and *after* tree at ``split_point``,
|
Split ``tree`` into a *before* and *after* tree at ``split_point``,
|
||||||
preserving tag structure, but not duplicating any text.
|
preserving tag structure, but not duplicating any text.
|
||||||
All tags that have had their text and tail
|
All tags that have had their text and tail
|
||||||
removed have the attribute ``calibre_split`` set to 1.
|
removed have the attribute ``calibre_split`` set to 1.
|
||||||
|
|
||||||
:param before: If True tree is split before split_point, otherwise after split_point
|
:param before: If True tree is split before split_point, otherwise after split_point
|
||||||
:return: before_tree, after_tree
|
:return: before_tree, after_tree
|
||||||
'''
|
'''
|
||||||
@ -189,7 +196,7 @@ class Splitter(LoggingInterface):
|
|||||||
body, body2 = root.body, root2.body
|
body, body2 = root.body, root2.body
|
||||||
split_point = root.xpath(path)[0]
|
split_point = root.xpath(path)[0]
|
||||||
split_point2 = root2.xpath(path)[0]
|
split_point2 = root2.xpath(path)[0]
|
||||||
|
|
||||||
def nix_element(elem, top=True):
|
def nix_element(elem, top=True):
|
||||||
if self.always_remove:
|
if self.always_remove:
|
||||||
parent = elem.getparent()
|
parent = elem.getparent()
|
||||||
@ -199,18 +206,18 @@ class Splitter(LoggingInterface):
|
|||||||
else:
|
else:
|
||||||
index = parent.index(elem)
|
index = parent.index(elem)
|
||||||
parent[index:index+1] = list(elem.iterchildren())
|
parent[index:index+1] = list(elem.iterchildren())
|
||||||
|
|
||||||
else:
|
else:
|
||||||
elem.text = u''
|
elem.text = u''
|
||||||
elem.tail = u''
|
elem.tail = u''
|
||||||
elem.set(SPLIT_ATTR, '1')
|
elem.set(SPLIT_ATTR, '1')
|
||||||
if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']:
|
if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']:
|
||||||
elem.set('style', 'display:none')
|
elem.set('style', 'display:none')
|
||||||
|
|
||||||
def fix_split_point(sp):
|
def fix_split_point(sp):
|
||||||
if not self.splitting_on_page_breaks:
|
if not self.splitting_on_page_breaks:
|
||||||
sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')
|
sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')
|
||||||
|
|
||||||
# Tree 1
|
# Tree 1
|
||||||
hit_split_point = False
|
hit_split_point = False
|
||||||
for elem in list(body.iterdescendants(etree.Element)):
|
for elem in list(body.iterdescendants(etree.Element)):
|
||||||
@ -224,8 +231,8 @@ class Splitter(LoggingInterface):
|
|||||||
continue
|
continue
|
||||||
if hit_split_point:
|
if hit_split_point:
|
||||||
nix_element(elem)
|
nix_element(elem)
|
||||||
|
|
||||||
|
|
||||||
# Tree 2
|
# Tree 2
|
||||||
hit_split_point = False
|
hit_split_point = False
|
||||||
for elem in list(body2.iterdescendants(etree.Element)):
|
for elem in list(body2.iterdescendants(etree.Element)):
|
||||||
@ -239,17 +246,17 @@ class Splitter(LoggingInterface):
|
|||||||
continue
|
continue
|
||||||
if not hit_split_point:
|
if not hit_split_point:
|
||||||
nix_element(elem, top=False)
|
nix_element(elem, top=False)
|
||||||
|
|
||||||
return tree, tree2
|
return tree, tree2
|
||||||
|
|
||||||
|
|
||||||
def split_on_page_breaks(self, orig_tree):
|
def split_on_page_breaks(self, orig_tree):
|
||||||
ordered_ids = []
|
ordered_ids = []
|
||||||
for elem in orig_tree.xpath('//*[@id]'):
|
for elem in orig_tree.xpath('//*[@id]'):
|
||||||
id = elem.get('id')
|
id = elem.get('id')
|
||||||
if id in self.page_break_ids:
|
if id in self.page_break_ids:
|
||||||
ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
|
ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
|
||||||
|
|
||||||
self.trees = []
|
self.trees = []
|
||||||
tree = orig_tree
|
tree = orig_tree
|
||||||
for pattern, before in ordered_ids:
|
for pattern, before in ordered_ids:
|
||||||
@ -261,13 +268,13 @@ class Splitter(LoggingInterface):
|
|||||||
tree = after
|
tree = after
|
||||||
self.trees.append(tree)
|
self.trees.append(tree)
|
||||||
self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
|
self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def find_page_breaks(self, stylesheets, root):
|
def find_page_breaks(self, stylesheets, root):
|
||||||
'''
|
'''
|
||||||
Find all elements that have either page-break-before or page-break-after set.
|
Find all elements that have either page-break-before or page-break-after set.
|
||||||
Populates `self.page_breaks` with id based XPath selectors (for elements that don't
|
Populates `self.page_breaks` with id based XPath selectors (for elements that don't
|
||||||
have ids, an id is created).
|
have ids, an id is created).
|
||||||
'''
|
'''
|
||||||
page_break_selectors = set([])
|
page_break_selectors = set([])
|
||||||
@ -284,16 +291,16 @@ class Splitter(LoggingInterface):
|
|||||||
page_break_selectors.add((CSSSelector(rule.selectorText), False))
|
page_break_selectors.add((CSSSelector(rule.selectorText), False))
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
page_breaks = set([])
|
page_breaks = set([])
|
||||||
for selector, before in page_break_selectors:
|
for selector, before in page_break_selectors:
|
||||||
for elem in selector(root):
|
for elem in selector(root):
|
||||||
elem.pb_before = before
|
elem.pb_before = before
|
||||||
page_breaks.add(elem)
|
page_breaks.add(elem)
|
||||||
|
|
||||||
for i, elem in enumerate(root.iter()):
|
for i, elem in enumerate(root.iter()):
|
||||||
elem.pb_order = i
|
elem.pb_order = i
|
||||||
|
|
||||||
page_breaks = list(page_breaks)
|
page_breaks = list(page_breaks)
|
||||||
page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
|
page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
|
||||||
self.page_break_ids = []
|
self.page_break_ids = []
|
||||||
@ -301,12 +308,12 @@ class Splitter(LoggingInterface):
|
|||||||
x.set('id', x.get('id', 'calibre_pb_%d'%i))
|
x.set('id', x.get('id', 'calibre_pb_%d'%i))
|
||||||
id = x.get('id')
|
id = x.get('id')
|
||||||
self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
|
self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
|
||||||
self.page_break_ids.append(id)
|
self.page_break_ids.append(id)
|
||||||
|
|
||||||
|
|
||||||
def find_split_point(self, root):
|
def find_split_point(self, root):
|
||||||
'''
|
'''
|
||||||
Find the tag at which to split the tree rooted at `root`.
|
Find the tag at which to split the tree rooted at `root`.
|
||||||
Search order is:
|
Search order is:
|
||||||
* Heading tags
|
* Heading tags
|
||||||
* <div> tags
|
* <div> tags
|
||||||
@ -315,7 +322,7 @@ class Splitter(LoggingInterface):
|
|||||||
* <p> tags
|
* <p> tags
|
||||||
* <br> tags
|
* <br> tags
|
||||||
* <li> tags
|
* <li> tags
|
||||||
|
|
||||||
We try to split in the "middle" of the file (as defined by tag counts.
|
We try to split in the "middle" of the file (as defined by tag counts.
|
||||||
'''
|
'''
|
||||||
def pick_elem(elems):
|
def pick_elem(elems):
|
||||||
@ -326,18 +333,18 @@ class Splitter(LoggingInterface):
|
|||||||
i = int(math.floor(len(elems)/2.))
|
i = int(math.floor(len(elems)/2.))
|
||||||
elems[i].set(SPLIT_POINT_ATTR, '1')
|
elems[i].set(SPLIT_POINT_ATTR, '1')
|
||||||
return elems[i]
|
return elems[i]
|
||||||
|
|
||||||
for path in (
|
for path in (
|
||||||
'//*[re:match(name(), "h[1-6]", "i")]',
|
'//*[re:match(name(), "h[1-6]", "i")]',
|
||||||
'/html/body/div',
|
'/html/body/div',
|
||||||
'//pre',
|
'//pre',
|
||||||
'//hr',
|
'//hr',
|
||||||
'//p',
|
'//p',
|
||||||
'//div',
|
'//div',
|
||||||
'//br',
|
'//br',
|
||||||
'//li',
|
'//li',
|
||||||
):
|
):
|
||||||
elems = root.xpath(path,
|
elems = root.xpath(path,
|
||||||
namespaces={'re':'http://exslt.org/regular-expressions'})
|
namespaces={'re':'http://exslt.org/regular-expressions'})
|
||||||
elem = pick_elem(elems)
|
elem = pick_elem(elems)
|
||||||
if elem is not None:
|
if elem is not None:
|
||||||
@ -346,9 +353,9 @@ class Splitter(LoggingInterface):
|
|||||||
except:
|
except:
|
||||||
continue
|
continue
|
||||||
return elem, True
|
return elem, True
|
||||||
|
|
||||||
return None, True
|
return None, True
|
||||||
|
|
||||||
def commit(self):
|
def commit(self):
|
||||||
'''
|
'''
|
||||||
Commit all changes caused by the split. This removes the previously
|
Commit all changes caused by the split. This removes the previously
|
||||||
@ -358,7 +365,7 @@ class Splitter(LoggingInterface):
|
|||||||
'''
|
'''
|
||||||
self.anchor_map = collections.defaultdict(lambda :self.base%0)
|
self.anchor_map = collections.defaultdict(lambda :self.base%0)
|
||||||
self.files = []
|
self.files = []
|
||||||
|
|
||||||
for i, tree in enumerate(self.trees):
|
for i, tree in enumerate(self.trees):
|
||||||
root = tree.getroot()
|
root = tree.getroot()
|
||||||
self.files.append(self.base%i)
|
self.files.append(self.base%i)
|
||||||
@ -368,7 +375,7 @@ class Splitter(LoggingInterface):
|
|||||||
for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)):
|
for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)):
|
||||||
elem.attrib.pop(SPLIT_ATTR, None)
|
elem.attrib.pop(SPLIT_ATTR, None)
|
||||||
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
|
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
|
||||||
|
|
||||||
for current, tree in zip(self.files, self.trees):
|
for current, tree in zip(self.files, self.trees):
|
||||||
for a in tree.getroot().xpath('//a[@href]'):
|
for a in tree.getroot().xpath('//a[@href]'):
|
||||||
href = a.get('href').strip()
|
href = a.get('href').strip()
|
||||||
@ -376,10 +383,10 @@ class Splitter(LoggingInterface):
|
|||||||
anchor = href[1:]
|
anchor = href[1:]
|
||||||
file = self.anchor_map[anchor]
|
file = self.anchor_map[anchor]
|
||||||
if file != current:
|
if file != current:
|
||||||
a.set('href', file+href)
|
a.set('href', file+href)
|
||||||
open(content(current), 'wb').\
|
open(content(current), 'wb').\
|
||||||
write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print))
|
write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print))
|
||||||
|
|
||||||
os.remove(content(self.path))
|
os.remove(content(self.path))
|
||||||
|
|
||||||
|
|
||||||
@ -392,12 +399,12 @@ class Splitter(LoggingInterface):
|
|||||||
id_map = {}
|
id_map = {}
|
||||||
for item in items:
|
for item in items:
|
||||||
id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)
|
id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)
|
||||||
|
|
||||||
for id in id_map.keys():
|
for id in id_map.keys():
|
||||||
opf.replace_spine_items_by_idref(id, id_map[id])
|
opf.replace_spine_items_by_idref(id, id_map[id])
|
||||||
|
|
||||||
for ref in opf.iterguide():
|
for ref in opf.iterguide():
|
||||||
href = ref.get('href', '')
|
href = ref.get('href', '')
|
||||||
if href.startswith('content/'+self.path):
|
if href.startswith('content/'+self.path):
|
||||||
href = href.split('#')
|
href = href.split('#')
|
||||||
frag = None
|
frag = None
|
||||||
@ -409,8 +416,8 @@ class Splitter(LoggingInterface):
|
|||||||
new_file = self.anchor_map[frag]
|
new_file = self.anchor_map[frag]
|
||||||
ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))
|
ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def fix_content_links(html_files, changes, opts):
|
def fix_content_links(html_files, changes, opts):
|
||||||
split_files = [f.path for f in changes]
|
split_files = [f.path for f in changes]
|
||||||
anchor_maps = [f.anchor_map for f in changes]
|
anchor_maps = [f.anchor_map for f in changes]
|
||||||
@ -421,7 +428,7 @@ def fix_content_links(html_files, changes, opts):
|
|||||||
files[i:i+1] = changes[j].files
|
files[i:i+1] = changes[j].files
|
||||||
except ValueError:
|
except ValueError:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
for htmlfile in files:
|
for htmlfile in files:
|
||||||
changed = False
|
changed = False
|
||||||
root = html.fromstring(open(content(htmlfile), 'rb').read())
|
root = html.fromstring(open(content(htmlfile), 'rb').read())
|
||||||
@ -440,7 +447,7 @@ def fix_content_links(html_files, changes, opts):
|
|||||||
frag = ('#'+anchor) if anchor else ''
|
frag = ('#'+anchor) if anchor else ''
|
||||||
a.set('href', newf+frag)
|
a.set('href', newf+frag)
|
||||||
changed = True
|
changed = True
|
||||||
|
|
||||||
if changed:
|
if changed:
|
||||||
open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
|
open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
|
||||||
|
|
||||||
@ -449,7 +456,7 @@ def fix_ncx(path, changes):
|
|||||||
anchor_maps = [f.anchor_map for f in changes]
|
anchor_maps = [f.anchor_map for f in changes]
|
||||||
tree = etree.parse(path)
|
tree = etree.parse(path)
|
||||||
changed = False
|
changed = False
|
||||||
for content in tree.getroot().xpath('//x:content[@src]',
|
for content in tree.getroot().xpath('//x:content[@src]',
|
||||||
namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
|
namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
|
||||||
href = content.get('src')
|
href = content.get('src')
|
||||||
if not href.startswith('#'):
|
if not href.startswith('#'):
|
||||||
@ -482,21 +489,21 @@ def find_html_files(opf):
|
|||||||
if os.path.exists(content(f)):
|
if os.path.exists(content(f)):
|
||||||
html_files.append(f)
|
html_files.append(f)
|
||||||
return html_files
|
return html_files
|
||||||
|
|
||||||
|
|
||||||
def split(pathtoopf, opts, stylesheet_map):
|
def split(pathtoopf, opts, stylesheet_map):
|
||||||
pathtoopf = os.path.abspath(pathtoopf)
|
pathtoopf = os.path.abspath(pathtoopf)
|
||||||
opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
|
opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
|
||||||
|
|
||||||
with CurrentDir(os.path.dirname(pathtoopf)):
|
with CurrentDir(os.path.dirname(pathtoopf)):
|
||||||
html_files = find_html_files(opf)
|
html_files = find_html_files(opf)
|
||||||
changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files]
|
changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files]
|
||||||
changes = [c for c in changes if c.was_split]
|
changes = [c for c in changes if c.was_split]
|
||||||
|
|
||||||
fix_content_links(html_files, changes, opts)
|
fix_content_links(html_files, changes, opts)
|
||||||
for item in opf.itermanifest():
|
for item in opf.itermanifest():
|
||||||
if item.get('media-type', '') == 'application/x-dtbncx+xml':
|
if item.get('media-type', '') == 'application/x-dtbncx+xml':
|
||||||
fix_ncx(item.get('href'), changes)
|
fix_ncx(item.get('href'), changes)
|
||||||
break
|
break
|
||||||
|
|
||||||
open(pathtoopf, 'wb').write(opf.render())
|
open(pathtoopf, 'wb').write(opf.render())
|
||||||
|
Loading…
x
Reference in New Issue
Block a user