Strip null (\0) bytes from HTML before parsing

This commit is contained in:
Kovid Goyal 2009-04-25 14:12:23 -07:00
parent e80fcc13fc
commit 0d07ad2610

View File

@@ -26,9 +26,9 @@ def sanitize_head(match):
def chap_head(match): def chap_head(match):
chap = match.group('chap') chap = match.group('chap')
title = match.group('title') title = match.group('title')
if not title: if not title:
return '<h1>'+chap+'</h1><br/>' return '<h1>'+chap+'</h1><br/>'
else: else:
return '<h1>'+chap+'<br/>'+title+'</h1><br/>' return '<h1>'+chap+'<br/>'+title+'</h1><br/>'
@@ -49,19 +49,19 @@ def line_length(raw, percent):
total = sum(lengths) total = sum(lengths)
avg = total / len(lengths) avg = total / len(lengths)
max_line = avg * 2 max_line = avg * 2
lengths = sorted(lengths) lengths = sorted(lengths)
for i in range(len(lengths) - 1, -1, -1): for i in range(len(lengths) - 1, -1, -1):
if lengths[i] > max_line: if lengths[i] > max_line:
del lengths[i] del lengths[i]
if percent > 1: if percent > 1:
percent = 1 percent = 1
if percent < 0: if percent < 0:
percent = 0 percent = 0
index = int(len(lengths) * percent) - 1 index = int(len(lengths) * percent) - 1
return lengths[index] return lengths[index]
@@ -110,17 +110,17 @@ class HTMLPreProcessor(object):
# Remove non breaking spaces # Remove non breaking spaces
(re.compile(ur'\u00a0'), lambda match : ' '), (re.compile(ur'\u00a0'), lambda match : ' '),
# Detect Chapters to match default XPATH in GUI # Detect Chapters to match default XPATH in GUI
(re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head),
(re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
# Have paragraphs show better # Have paragraphs show better
(re.compile(r'<br.*?>'), lambda match : '<p>'), (re.compile(r'<br.*?>'), lambda match : '<p>'),
# Un wrap lines # Un wrap lines
(re.compile(r'(?<=[^\.^\^?^!^"^”])\s*(</(i|b|u)>)*\s*<p.*?>\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '), (re.compile(r'(?<=[^\.^\^?^!^"^”])\s*(</(i|b|u)>)*\s*<p.*?>\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '),
# Clean up spaces # Clean up spaces
(re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics # Add space before and after italics
@@ -157,6 +157,7 @@ class HTMLPreProcessor(object):
def __call__(self, html, remove_special_chars=None): def __call__(self, html, remove_special_chars=None):
if remove_special_chars is not None: if remove_special_chars is not None:
html = remove_special_chars.sub('', html) html = remove_special_chars.sub('', html)
html = html.replace('\0', '')
if self.is_baen(html): if self.is_baen(html):
rules = [] rules = []
elif self.is_book_designer(html): elif self.is_book_designer(html):
@@ -166,7 +167,7 @@ class HTMLPreProcessor(object):
#line_length_rules = [ #line_length_rules = [
# (re.compile('%i' % line_length(html, .85)), lambda match:) # (re.compile('%i' % line_length(html, .85)), lambda match:)
#] #]
rules = self.PDFTOHTML # + line_length_rules rules = self.PDFTOHTML # + line_length_rules
else: else:
rules = [] rules = []