From 0d07ad2610b8b58d237075392353fb35e45d2ae7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 25 Apr 2009 14:12:23 -0700 Subject: [PATCH] Strip 0 bytes from HTML before parsing --- src/calibre/ebooks/conversion/preprocess.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index fb55ee74fb..42e6654127 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -26,9 +26,9 @@ def sanitize_head(match): def chap_head(match): chap = match.group('chap') title = match.group('title') - if not title: + if not title: return '

'+chap+'


' - else: + else: return '

'+chap+'
'+title+'


' @@ -49,19 +49,19 @@ def line_length(raw, percent): total = sum(lengths) avg = total / len(lengths) max_line = avg * 2 - + lengths = sorted(lengths) for i in range(len(lengths) - 1, -1, -1): if lengths[i] > max_line: del lengths[i] - + if percent > 1: percent = 1 if percent < 0: percent = 0 index = int(len(lengths) * percent) - 1 - + return lengths[index] @@ -110,17 +110,17 @@ class HTMLPreProcessor(object): # Remove non breaking spaces (re.compile(ur'\u00a0'), lambda match : ' '), - + # Detect Chapters to match default XPATH in GUI (re.compile(r'(]*>)?(]*>)?s*(?P(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(]*>|]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(]*>|]*>))((?P.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), - + # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), - + # Un wrap lines (re.compile(r'(?<=[^\.^\^?^!^"^”])\s*(</(i|b|u)>)*\s*<p.*?>\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '), - + # Clean up spaces (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Add space before and after italics @@ -157,6 +157,7 @@ class HTMLPreProcessor(object): def __call__(self, html, remove_special_chars=None): if remove_special_chars is not None: html = remove_special_chars.sub('', html) + html = html.replace('\0', '') if self.is_baen(html): rules = [] elif self.is_book_designer(html): @@ -166,7 +167,7 @@ class HTMLPreProcessor(object): #line_length_rules = [ # (re.compile('%i' % line_length(html, .85)), lambda match:) #] - + rules = self.PDFTOHTML # + line_length_rules else: rules = []