Fix #1054 (Any2epub fails) and add a check to detect files that have a large markup-to-text ratio during splitting.

This commit is contained in:
Kovid Goyal 2008-09-24 16:37:31 -07:00
parent eadb7172c4
commit 8652cff214
2 changed files with 8 additions and 3 deletions

View File

@ -42,7 +42,8 @@ class Splitter(LoggingInterface):
self.always_remove = always_remove
self.base = os.path.splitext(path)[0] + '_split_%d.html'
self.opts = opts
self.log_info('\tSplitting %s (%d KB)', path, os.stat(content(path)).st_size/1024.)
self.orig_size = os.stat(content(path)).st_size
self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
root = html.fromstring(open(content(path)).read())
css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root)
@ -63,6 +64,7 @@ class Splitter(LoggingInterface):
self.find_page_breaks(stylesheet, root)
self.trees = []
self.split_size = 0
self.split(root.getroottree())
self.commit()
self.log_info('\t\tSplit into %d parts.', len(self.trees))
@ -80,7 +82,7 @@ class Splitter(LoggingInterface):
self.log_debug('\t\tSplitting...')
root = tree.getroot()
split_point, before = self.find_split_point(root)
if split_point is None:
if split_point is None:# or self.split_size > 6*self.orig_size:
if not self.always_remove:
self.log_warn(_('\t\tToo much markup. Re-splitting without structure preservation. This may cause incorrect rendering.'))
raise SplitError(self.path, root)
@ -144,6 +146,7 @@ class Splitter(LoggingInterface):
if size <= self.opts.profile.flow_size:
self.trees.append(t)
self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)', len(self.trees), size/1024.)
self.split_size += size
else:
self.split(t)

View File

@ -350,6 +350,8 @@ class MobiReader(object):
pos = 0
self.processed_html = ''
for end in positions:
if end == 0:
continue
oend = end
l = self.mobi_html.find('<', end)
r = self.mobi_html.find('>', end)