mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Implement support for splitting large <pre> tags. Fixes #1094 (EPUB Conversion Error)
This commit is contained in:
parent
80dad7f79a
commit
2eb80adcd1
@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
Split the flows in an epub file to conform to size limitations.
|
Split the flows in an epub file to conform to size limitations.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import os, math, copy, logging, functools, collections
|
import os, math, logging, functools, collections, re, copy
|
||||||
|
|
||||||
from lxml.etree import XPath as _XPath
|
from lxml.etree import XPath as _XPath
|
||||||
from lxml import etree, html
|
from lxml import etree, html
|
||||||
@ -72,7 +72,25 @@ class Splitter(LoggingInterface):
|
|||||||
for f in self.files:
|
for f in self.files:
|
||||||
self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
|
self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
|
||||||
self.trees = None
|
self.trees = None
|
||||||
|
|
||||||
|
def split_text(self, text, root, size):
    '''
    Split ``text`` into fragments of at most ``size`` characters, breaking
    only at paragraph boundaries (blank lines).

    Carriage returns are stripped before splitting. Joining the returned
    fragments with a blank line (``'\\n\\n'``) reproduces the normalised text,
    i.e. no paragraph is dropped and no extra blank lines are introduced.

    :param text: The text to split.
    :param root: Passed through to ``SplitError`` when splitting is impossible.
    :param size: Maximum length of a fragment, in characters.
    :return: A list of fragments in document order (never empty).
    :raises SplitError: If a single paragraph is longer than ``size``.
    '''
    self.log_debug('\t\t\tSplitting text of length: %d'%len(text))
    rest = text.replace('\r', '')
    parts = rest.split('\n\n')
    self.log_debug('\t\t\t\tFound %d parts'%len(parts))
    if max(map(len, parts)) > size:
        raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
    sep = '\n\n'
    ans = []
    # ``None`` marks "no fragment started yet" so that the first paragraph
    # is not prefixed with a separator and an empty fragment is never emitted.
    buf = None
    for part in parts:
        if buf is None:
            buf = part
        elif len(buf) + len(sep) + len(part) <= size:
            # The separator counts towards the fragment size.
            buf += sep + part
        else:
            ans.append(buf)
            buf = part
    # Flush the trailing fragment; without this the end of the text is lost.
    if buf is not None:
        ans.append(buf)
    return ans
|
||||||
|
|
||||||
|
|
||||||
def split(self, tree):
|
def split(self, tree):
|
||||||
'''
|
'''
|
||||||
Split ``tree`` into a *before* and *after* tree, preserving tag structure,
|
Split ``tree`` into a *before* and *after* tree, preserving tag structure,
|
||||||
@ -81,6 +99,25 @@ class Splitter(LoggingInterface):
|
|||||||
'''
|
'''
|
||||||
self.log_debug('\t\tSplitting...')
|
self.log_debug('\t\tSplitting...')
|
||||||
root = tree.getroot()
|
root = tree.getroot()
|
||||||
|
# Split large <pre> tags
|
||||||
|
for pre in list(root.xpath('//pre')):
|
||||||
|
text = u''.join(pre.xpath('./text()'))
|
||||||
|
pre.text = text
|
||||||
|
for child in list(pre.iterdescendants()):
|
||||||
|
pre.remove(child)
|
||||||
|
if len(pre.text) > self.opts.profile.flow_size*0.5:
|
||||||
|
frags = self.split_text(pre.text, root, int(0.2*self.opts.profile.flow_size))
|
||||||
|
new_pres = []
|
||||||
|
for frag in frags:
|
||||||
|
pre2 = copy.copy(pre)
|
||||||
|
pre2.text = frag
|
||||||
|
pre2.tail = u''
|
||||||
|
new_pres.append(pre2)
|
||||||
|
new_pres[-1].tail = pre.tail
|
||||||
|
p = pre.getparent()
|
||||||
|
i = p.index(pre)
|
||||||
|
p[i:i+1] = new_pres
|
||||||
|
|
||||||
split_point, before = self.find_split_point(root)
|
split_point, before = self.find_split_point(root)
|
||||||
if split_point is None or self.split_size > 6*self.orig_size:
|
if split_point is None or self.split_size > 6*self.orig_size:
|
||||||
if not self.always_remove:
|
if not self.always_remove:
|
||||||
@ -219,10 +256,21 @@ class Splitter(LoggingInterface):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
for path in ('//*[re:match(name(), "h[1-6]", "i")]', '/html/body/div', '//p'):
|
for path in (
|
||||||
|
'//*[re:match(name(), "h[1-6]", "i")]',
|
||||||
|
'/html/body/div',
|
||||||
|
'//pre',
|
||||||
|
'//hr',
|
||||||
|
'//p',
|
||||||
|
'//br',
|
||||||
|
):
|
||||||
elems = root.xpath(path)
|
elems = root.xpath(path)
|
||||||
elem = pick_elem(elems)
|
elem = pick_elem(elems)
|
||||||
if elem is not None:
|
if elem is not None:
|
||||||
|
try:
|
||||||
|
XPath(elem.getroottree().getpath(elem))
|
||||||
|
except:
|
||||||
|
continue
|
||||||
return elem, True
|
return elem, True
|
||||||
|
|
||||||
return None, True
|
return None, True
|
||||||
|
@ -378,7 +378,14 @@ class Parser(PreProcessor, LoggingInterface):
|
|||||||
self.log_exception('lxml based parsing failed')
|
self.log_exception('lxml based parsing failed')
|
||||||
self.root = soupparser.fromstring(src)
|
self.root = soupparser.fromstring(src)
|
||||||
head = self.root.xpath('./head')
|
head = self.root.xpath('./head')
|
||||||
self.head = head[0] if head else etree.SubElement(self.root, 'head')
|
if head:
|
||||||
|
head = head[0]
|
||||||
|
else:
|
||||||
|
head = etree.SubElement(self.root, 'head')
|
||||||
|
self.root.remove(head)
|
||||||
|
self.root.insert(0, head)
|
||||||
|
|
||||||
|
self.head = head
|
||||||
self.body = self.root.body
|
self.body = self.root.body
|
||||||
for a in self.root.xpath('//a[@name]'):
|
for a in self.root.xpath('//a[@name]'):
|
||||||
a.set('id', a.get('name'))
|
a.set('id', a.get('name'))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user