mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Implement support for splitting large <pre> tags. Fixes #1094 (EPUB Conversion Error)
This commit is contained in:
parent
80dad7f79a
commit
2eb80adcd1
@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
|
||||
Split the flows in an epub file to conform to size limitations.
|
||||
'''
|
||||
|
||||
import os, math, copy, logging, functools, collections
|
||||
import os, math, logging, functools, collections, re, copy
|
||||
|
||||
from lxml.etree import XPath as _XPath
|
||||
from lxml import etree, html
|
||||
@ -73,6 +73,24 @@ class Splitter(LoggingInterface):
|
||||
self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
|
||||
self.trees = None
|
||||
|
||||
def split_text(self, text, root, size):
    '''
    Break the text of a large <pre> tag into fragments of roughly ``size``
    characters, cutting only at blank lines (paragraph boundaries).

    Carriage returns are stripped first. Joining the returned fragments
    with a blank line (``'\\n\\n'``) reproduces the normalised text, so no
    content is lost.

    :param text: The text content of the <pre> tag.
    :param root: Passed to `SplitError` when the text cannot be split.
    :param size: Maximum length of a single fragment.
    :return: A non-empty list of strings, each at most ``size`` long.
    :raises SplitError: If a single paragraph is longer than ``size``.
    '''
    self.log_debug('\t\t\tSplitting text of length: %d'%len(text))
    rest = text.replace('\r', '')
    parts = rest.split('\n\n')
    self.log_debug('\t\t\t\tFound %d parts'%len(parts))
    if max(map(len, parts)) > size:
        raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
    sep = '\n\n'
    ans = []
    # Paragraphs collected for the fragment currently being built. A list
    # (rather than a string) is used so that an empty leading paragraph
    # still counts as "something is buffered".
    buf = []
    buf_len = 0
    for part in parts:
        # The separator only costs space between two paragraphs.
        extra = len(part) + (len(sep) if buf else 0)
        if buf and buf_len + extra >= size:
            # Adding this paragraph would reach the limit: close the
            # current fragment and start a new one with this paragraph.
            ans.append(sep.join(buf))
            buf = []
            buf_len = 0
            extra = len(part)
        buf.append(part)
        buf_len += extra
    # Flush the final fragment; without this the tail of the text is lost.
    ans.append(sep.join(buf))
    return ans
|
||||
|
||||
|
||||
def split(self, tree):
|
||||
'''
|
||||
Split ``tree`` into a *before* and *after* tree, preserving tag structure,
|
||||
@ -81,6 +99,25 @@ class Splitter(LoggingInterface):
|
||||
'''
|
||||
self.log_debug('\t\tSplitting...')
|
||||
root = tree.getroot()
|
||||
# Split large <pre> tags
|
||||
for pre in list(root.xpath('//pre')):
|
||||
text = u''.join(pre.xpath('./text()'))
|
||||
pre.text = text
|
||||
for child in list(pre.iterdescendants()):
|
||||
pre.remove(child)
|
||||
if len(pre.text) > self.opts.profile.flow_size*0.5:
|
||||
frags = self.split_text(pre.text, root, int(0.2*self.opts.profile.flow_size))
|
||||
new_pres = []
|
||||
for frag in frags:
|
||||
pre2 = copy.copy(pre)
|
||||
pre2.text = frag
|
||||
pre2.tail = u''
|
||||
new_pres.append(pre2)
|
||||
new_pres[-1].tail = pre.tail
|
||||
p = pre.getparent()
|
||||
i = p.index(pre)
|
||||
p[i:i+1] = new_pres
|
||||
|
||||
split_point, before = self.find_split_point(root)
|
||||
if split_point is None or self.split_size > 6*self.orig_size:
|
||||
if not self.always_remove:
|
||||
@ -219,10 +256,21 @@ class Splitter(LoggingInterface):
|
||||
|
||||
|
||||
|
||||
for path in ('//*[re:match(name(), "h[1-6]", "i")]', '/html/body/div', '//p'):
|
||||
for path in (
|
||||
'//*[re:match(name(), "h[1-6]", "i")]',
|
||||
'/html/body/div',
|
||||
'//pre',
|
||||
'//hr',
|
||||
'//p',
|
||||
'//br',
|
||||
):
|
||||
elems = root.xpath(path)
|
||||
elem = pick_elem(elems)
|
||||
if elem is not None:
|
||||
try:
|
||||
XPath(elem.getroottree().getpath(elem))
|
||||
except:
|
||||
continue
|
||||
return elem, True
|
||||
|
||||
return None, True
|
||||
|
@ -378,7 +378,14 @@ class Parser(PreProcessor, LoggingInterface):
|
||||
self.log_exception('lxml based parsing failed')
|
||||
self.root = soupparser.fromstring(src)
|
||||
head = self.root.xpath('./head')
|
||||
self.head = head[0] if head else etree.SubElement(self.root, 'head')
|
||||
if head:
|
||||
head = head[0]
|
||||
else:
|
||||
head = etree.SubElement(self.root, 'head')
|
||||
self.root.remove(head)
|
||||
self.root.insert(0, head)
|
||||
|
||||
self.head = head
|
||||
self.body = self.root.body
|
||||
for a in self.root.xpath('//a[@name]'):
|
||||
a.set('id', a.get('name'))
|
||||
|
Loading…
x
Reference in New Issue
Block a user