From 8652cff214d115e4c48444f47af68fbc01ca22f6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 24 Sep 2008 16:37:31 -0700 Subject: [PATCH] Fix #1054 (Any2epub fails) and put in a check to detect files that have a large markup:text ratio during splitting. --- src/calibre/ebooks/epub/split.py | 9 ++++++--- src/calibre/ebooks/mobi/reader.py | 2 ++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/epub/split.py index 4e17d19a34..0fbcac4bec 100644 --- a/src/calibre/ebooks/epub/split.py +++ b/src/calibre/ebooks/epub/split.py @@ -42,7 +42,8 @@ class Splitter(LoggingInterface): self.always_remove = always_remove self.base = os.path.splitext(path)[0] + '_split_%d.html' self.opts = opts - self.log_info('\tSplitting %s (%d KB)', path, os.stat(content(path)).st_size/1024.) + self.orig_size = os.stat(content(path)).st_size + self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.) root = html.fromstring(open(content(path)).read()) css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root) @@ -61,8 +62,9 @@ class Splitter(LoggingInterface): self.page_breaks = [] if stylesheet is not None: self.find_page_breaks(stylesheet, root) - + self.trees = [] + self.split_size = 0 self.split(root.getroottree()) self.commit() self.log_info('\t\tSplit into %d parts.', len(self.trees)) @@ -80,7 +82,7 @@ class Splitter(LoggingInterface): self.log_debug('\t\tSplitting...') root = tree.getroot() split_point, before = self.find_split_point(root) - if split_point is None: + if split_point is None:# or self.split_size > 6*self.orig_size: if not self.always_remove: self.log_warn(_('\t\tToo much markup. Re-splitting without structure preservation. This may cause incorrect rendering.')) raise SplitError(self.path, root) @@ -144,6 +146,7 @@ class Splitter(LoggingInterface): if size <= self.opts.profile.flow_size: self.trees.append(t) self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)', len(self.trees), size/1024.) + self.split_size += size else: self.split(t) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 55c6be0ae9..2c9969421a 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -350,6 +350,8 @@ class MobiReader(object): pos = 0 self.processed_html = '' for end in positions: + if end == 0: + continue oend = end l = self.mobi_html.find('<', end) r = self.mobi_html.find('>', end)