Fix #1065 (EPUB Conversion Error)

2025-11-16 03:23:03 -05:00 · 2008-09-26 09:25:30 -07:00 · 2008-09-26 09:25:30 -07:00 · ca806a09c3
commit ca806a09c3
parent 0b8168258a
2 changed files with 7 additions and 12 deletions
--- a/src/calibre/ebooks/epub/split.py
+++ b/src/calibre/ebooks/epub/split.py
@@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
 Split the flows in an epub file to conform to size limitations.
 '''

-import os, math, copy, logging, functools
+import os, math, copy, logging, functools, collections

 from lxml.etree import XPath as _XPath
 from lxml import etree, html
@@ -234,7 +234,7 @@ class Splitter(LoggingInterface):
        all anchors in the original tree. Internal links are re-directed. The
        original file is deleted and the split files are saved.
        '''
-        self.anchor_map = {None:self.base%0}
+        self.anchor_map = collections.defaultdict(lambda :self.base%0)
        self.files = []
        
        for i, tree in enumerate(self.trees):
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -252,15 +252,7 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):

 class PreProcessor(object):
    PREPROCESS = []
-    # Fix Baen markup
-    BAEN = [ 
-                     (re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE), 
-                      lambda match: match.group(1)),
-                     (re.compile(r'<p>\s*(<a id.*?>\s*</a>)\s*</p>', re.IGNORECASE), 
-                      lambda match: match.group(1)),
-                     (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*</a>', re.IGNORECASE), 
-                      lambda match: ''),
-                     ]
+                     
    # Fix pdftohtml markup
    PDFTOHTML  = [
                  # Remove <hr> tags
@@ -275,6 +267,9 @@ class PreProcessor(object):
                  # Remove hyphenation
                  (re.compile(r'-\n\r?'), lambda match: ''),
                  
+                  # Remove gray background
+                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>')
+                  
                  ]
    
    # Fix Book Designer markup
@@ -305,7 +300,7 @@ class PreProcessor(object):
                          
    def preprocess(self, html):
        if self.is_baen(html):
-            rules = self.BAEN
+            rules = []
        elif self.is_book_designer(html):
            rules = self.BOOK_DESIGNER
        elif self.is_pdftohtml(html):