Implement support for splitting large <pre> tags. Fixes #1094 (EPUB Conversion Error)

2025-11-13 10:06:59 -05:00 · 2008-09-29 13:14:01 -07:00 · 2008-09-29 13:14:01 -07:00 · 2eb80adcd1
commit 2eb80adcd1
parent 80dad7f79a
2 changed files with 59 additions and 4 deletions
--- a/src/calibre/ebooks/epub/split.py
+++ b/src/calibre/ebooks/epub/split.py
@@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
 Split the flows in an epub file to conform to size limitations.
 '''

-import os, math, copy, logging, functools, collections
+import os, math, logging, functools, collections, re, copy

 from lxml.etree import XPath as _XPath
 from lxml import etree, html
@@ -73,6 +73,24 @@ class Splitter(LoggingInterface):
                self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
        self.trees = None
    
+    def split_text(self, text, root, size):
+        '''Split ``text`` at blank lines into a list of chunks of about ``size`` characters.'''
+        self.log_debug('\t\t\tSplitting text of length: %d'%len(text))
+        parts = re.split('\n\n', text.replace('\r', ''))
+        self.log_debug('\t\t\t\tFound %d parts'%len(parts))
+        if max(map(len, parts)) > size:
+            raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
+        ans, buf = [], parts[0]  # seed with the first part: no spurious leading blank line
+        for part in parts[1:]:
+            if len(buf) + len(part) < size:
+                buf += '\n\n'+part
+            else:
+                # Chunk is full: flush it and start the next one with this part.
+                ans.append(buf)
+                buf = part
+        return ans + [buf]  # the trailing chunk must be emitted too, or its text is lost
+            
+    
    def split(self, tree):
        '''
        Split ``tree`` into a *before* and *after* tree, preserving tag structure,
@@ -81,6 +99,25 @@ class Splitter(LoggingInterface):
        '''
        self.log_debug('\t\tSplitting...')
        root = tree.getroot()
+        # Split large <pre> tags
+        for pre in list(root.xpath('//pre')):
+            text = u''.join(pre.xpath('./text()'))
+            pre.text = text
+            for child in list(pre.iterdescendants()):
+                pre.remove(child)
+            if len(pre.text) > self.opts.profile.flow_size*0.5:
+                frags = self.split_text(pre.text, root, int(0.2*self.opts.profile.flow_size))
+                new_pres = []
+                for frag in frags:
+                    pre2 = copy.copy(pre)
+                    pre2.text = frag
+                    pre2.tail = u''
+                    new_pres.append(pre2)
+                new_pres[-1].tail = pre.tail
+                p = pre.getparent()
+                i = p.index(pre)
+                p[i:i+1] = new_pres
+        
        split_point, before = self.find_split_point(root)
        if split_point is None or self.split_size > 6*self.orig_size:
            if not self.always_remove:
@@ -219,10 +256,21 @@ class Splitter(LoggingInterface):
        
            
                            
-        for path in ('//*[re:match(name(), "h[1-6]", "i")]', '/html/body/div', '//p'):
+        for path in (
+                     '//*[re:match(name(), "h[1-6]", "i")]', 
+                     '/html/body/div',
+                     '//pre',
+                     '//hr', 
+                     '//p',
+                     '//br',
+                     ):
            elems = root.xpath(path)
            elem = pick_elem(elems)
            if elem is not None:
+                try:
+                    XPath(elem.getroottree().getpath(elem))
+                except:
+                    continue
                return elem, True
            
        return None, True
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -378,7 +378,14 @@ class Parser(PreProcessor, LoggingInterface):
                self.log_exception('lxml based parsing failed')
            self.root = soupparser.fromstring(src)
        head = self.root.xpath('./head')
-        self.head = head[0] if head else etree.SubElement(self.root, 'head')
+        if head:
+            head = head[0]
+        else:
+            head = etree.SubElement(self.root, 'head')
+            self.root.remove(head)
+            self.root.insert(0, head)
+
+        self.head = head 
        self.body = self.root.body
        for a in self.root.xpath('//a[@name]'):
            a.set('id', a.get('name'))