Fix #1054 (Any2epub fails) and add size bookkeeping for a check (currently commented out) that detects files with a large markup:text ratio during splitting.

2026-02-14 23:42:13 -05:00 · 2008-09-24 16:37:31 -07:00 · 2008-09-24 16:37:31 -07:00 · 8652cff214
commit 8652cff214
parent eadb7172c4
2 changed files with 8 additions and 3 deletions
--- a/src/calibre/ebooks/epub/split.py
+++ b/src/calibre/ebooks/epub/split.py
@ -42,7 +42,8 @@ class Splitter(LoggingInterface):
        self.always_remove = always_remove
        self.base = os.path.splitext(path)[0] + '_split_%d.html'
        self.opts = opts
-        self.log_info('\tSplitting %s (%d KB)', path, os.stat(content(path)).st_size/1024.)
+        self.orig_size = os.stat(content(path)).st_size
+        self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
        root = html.fromstring(open(content(path)).read())
            
        css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root)
@ -61,8 +62,9 @@ class Splitter(LoggingInterface):
        self.page_breaks = []
        if stylesheet is not None:
            self.find_page_breaks(stylesheet, root)
-        
+            
        self.trees = []
+        self.split_size = 0
        self.split(root.getroottree())
        self.commit()
        self.log_info('\t\tSplit into %d parts.', len(self.trees))
@ -80,7 +82,7 @@ class Splitter(LoggingInterface):
        self.log_debug('\t\tSplitting...')
        root = tree.getroot()
        split_point, before = self.find_split_point(root)
-        if split_point is None:
+        if split_point is None:# or self.split_size > 6*self.orig_size:
            if not self.always_remove:
                self.log_warn(_('\t\tToo much markup. Re-splitting without structure preservation. This may cause incorrect rendering.'))
            raise SplitError(self.path, root)
@ -144,6 +146,7 @@ class Splitter(LoggingInterface):
            if size <= self.opts.profile.flow_size:
                self.trees.append(t)
                self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)', len(self.trees), size/1024.)
+                self.split_size += size
            else:
                self.split(t)
                
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@ -350,6 +350,8 @@ class MobiReader(object):
        pos = 0
        self.processed_html = ''
        for end in positions:
+            if end == 0:
+                continue
            oend = end
            l = self.mobi_html.find('<', end)
            r = self.mobi_html.find('>', end)