Strip 0 bytes from HTML before parsing

2025-11-19 04:53:05 -05:00 · 2009-04-25 14:12:23 -07:00 · 2009-04-25 14:12:23 -07:00 · 0d07ad2610
commit 0d07ad2610
parent e80fcc13fc
1 changed files with 11 additions and 10 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -26,9 +26,9 @@ def sanitize_head(match):
 def chap_head(match):
    chap = match.group('chap')
    title = match.group('title')
-    if not title: 
+    if not title:
               return '<h1>'+chap+'</h1><br/>'
-    else: 
+    else:
               return '<h1>'+chap+'<br/>'+title+'</h1><br/>'


@@ -49,19 +49,19 @@ def line_length(raw, percent):
    total = sum(lengths)
    avg = total / len(lengths)
    max_line = avg * 2
-    
+
    lengths = sorted(lengths)
    for i in range(len(lengths) - 1, -1, -1):
        if lengths[i] > max_line:
            del lengths[i]
-    
+
    if percent > 1:
        percent = 1
    if percent < 0:
        percent = 0

    index = int(len(lengths) * percent) - 1
-    
+
    return lengths[index]


@@ -110,17 +110,17 @@ class HTMLPreProcessor(object):

                  # Remove non breaking spaces
                  (re.compile(ur'\u00a0'), lambda match : ' '),
-                  
+
                  # Detect Chapters to match default XPATH in GUI
                  (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head),
                  (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
- 
+
                  # Have paragraphs show better
                  (re.compile(r'<br.*?>'), lambda match : '<p>'),
-                  
+
                  # Un wrap lines
                  (re.compile(r'(?<=[^\.^\^?^!^"^”])\s*(</(i|b|u)>)*\s*<p.*?>\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '),
-                  
+
                  # Clean up spaces
                  (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
                  # Add space before and after italics
@@ -157,6 +157,7 @@ class HTMLPreProcessor(object):
    def __call__(self, html, remove_special_chars=None):
        if remove_special_chars is not None:
            html = remove_special_chars.sub('', html)
+        html = html.replace('\0', '')
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
@@ -166,7 +167,7 @@ class HTMLPreProcessor(object):
            #line_length_rules = [
            #    (re.compile('%i' % line_length(html, .85)), lambda match:)
            #]
-            
+
            rules = self.PDFTOHTML # + line_length_rules
        else:
            rules = []