mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Strip 0 bytes from HTML before parsing
This commit is contained in:
parent
e80fcc13fc
commit
0d07ad2610
@@ -26,9 +26,9 @@ def sanitize_head(match):
|
|||||||
def chap_head(match):
    """Return the ``<h1>`` heading markup for a chapter regex match.

    ``match`` is a regex match object that exposes the named groups
    ``chap`` (the chapter label) and ``title`` (the chapter title).

    When ``title`` is missing or empty the heading contains only the
    chapter label; otherwise the label and the title are placed in the
    same ``<h1>``, separated by ``<br/>``. In both cases the heading is
    followed by a trailing ``<br/>``.
    """
    chap = match.group('chap')
    title = match.group('title')
    # The title group may be None (did not participate in the match)
    # or an empty string; both mean "label-only heading".
    if not title:
        return '<h1>'+chap+'</h1><br/>'
    return '<h1>'+chap+'<br/>'+title+'</h1><br/>'
|
||||||
|
|
||||||
|
|
||||||
@@ -49,19 +49,19 @@ def line_length(raw, percent):
|
|||||||
total = sum(lengths)
|
total = sum(lengths)
|
||||||
avg = total / len(lengths)
|
avg = total / len(lengths)
|
||||||
max_line = avg * 2
|
max_line = avg * 2
|
||||||
|
|
||||||
lengths = sorted(lengths)
|
lengths = sorted(lengths)
|
||||||
for i in range(len(lengths) - 1, -1, -1):
|
for i in range(len(lengths) - 1, -1, -1):
|
||||||
if lengths[i] > max_line:
|
if lengths[i] > max_line:
|
||||||
del lengths[i]
|
del lengths[i]
|
||||||
|
|
||||||
if percent > 1:
|
if percent > 1:
|
||||||
percent = 1
|
percent = 1
|
||||||
if percent < 0:
|
if percent < 0:
|
||||||
percent = 0
|
percent = 0
|
||||||
|
|
||||||
index = int(len(lengths) * percent) - 1
|
index = int(len(lengths) * percent) - 1
|
||||||
|
|
||||||
return lengths[index]
|
return lengths[index]
|
||||||
|
|
||||||
|
|
||||||
@@ -110,17 +110,17 @@ class HTMLPreProcessor(object):
|
|||||||
|
|
||||||
# Remove non breaking spaces
|
# Remove non breaking spaces
|
||||||
(re.compile(ur'\u00a0'), lambda match : ' '),
|
(re.compile(ur'\u00a0'), lambda match : ' '),
|
||||||
|
|
||||||
# Detect Chapters to match default XPATH in GUI
|
# Detect Chapters to match default XPATH in GUI
|
||||||
(re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head),
|
(re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head),
|
||||||
(re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
|
(re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
|
||||||
|
|
||||||
# Have paragraphs show better
|
# Have paragraphs show better
|
||||||
(re.compile(r'<br.*?>'), lambda match : '<p>'),
|
(re.compile(r'<br.*?>'), lambda match : '<p>'),
|
||||||
|
|
||||||
# Un wrap lines
|
# Un wrap lines
|
||||||
(re.compile(r'(?<=[^\.^\^?^!^"^”])\s*(</(i|b|u)>)*\s*<p.*?>\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '),
|
(re.compile(r'(?<=[^\.^\^?^!^"^”])\s*(</(i|b|u)>)*\s*<p.*?>\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '),
|
||||||
|
|
||||||
# Clean up spaces
|
# Clean up spaces
|
||||||
(re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
(re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
||||||
# Add space before and after italics
|
# Add space before and after italics
|
||||||
@@ -157,6 +157,7 @@ class HTMLPreProcessor(object):
|
|||||||
def __call__(self, html, remove_special_chars=None):
|
def __call__(self, html, remove_special_chars=None):
|
||||||
if remove_special_chars is not None:
|
if remove_special_chars is not None:
|
||||||
html = remove_special_chars.sub('', html)
|
html = remove_special_chars.sub('', html)
|
||||||
|
html = html.replace('\0', '')
|
||||||
if self.is_baen(html):
|
if self.is_baen(html):
|
||||||
rules = []
|
rules = []
|
||||||
elif self.is_book_designer(html):
|
elif self.is_book_designer(html):
|
||||||
@@ -166,7 +167,7 @@ class HTMLPreProcessor(object):
|
|||||||
#line_length_rules = [
|
#line_length_rules = [
|
||||||
# (re.compile('%i' % line_length(html, .85)), lambda match:)
|
# (re.compile('%i' % line_length(html, .85)), lambda match:)
|
||||||
#]
|
#]
|
||||||
|
|
||||||
rules = self.PDFTOHTML # + line_length_rules
|
rules = self.PDFTOHTML # + line_length_rules
|
||||||
else:
|
else:
|
||||||
rules = []
|
rules = []
|
||||||
|
Loading…
x
Reference in New Issue
Block a user