diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/epub/split.py index 33c6360495..38f51ae3be 100644 --- a/src/calibre/ebooks/epub/split.py +++ b/src/calibre/ebooks/epub/split.py @@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en' Split the flows in an epub file to conform to size limitations. ''' -import os, math, copy, logging, functools +import os, math, copy, logging, functools, collections from lxml.etree import XPath as _XPath from lxml import etree, html @@ -234,7 +234,7 @@ class Splitter(LoggingInterface): all anchors in the original tree. Internal links are re-directed. The original file is deleted and the split files are saved. ''' - self.anchor_map = {None:self.base%0} + self.anchor_map = collections.defaultdict(lambda :self.base%0) self.files = [] for i, tree in enumerate(self.trees): diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index 147e4a8a79..1433976113 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -252,15 +252,7 @@ def opf_traverse(opf_reader, verbose=0, encoding=None): class PreProcessor(object): PREPROCESS = [] - # Fix Baen markup - BAEN = [ - (re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE), - lambda match: match.group(1)), - (re.compile(r'

\s*(\s*)\s*

', re.IGNORECASE), - lambda match: match.group(1)), - (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*', re.IGNORECASE), - lambda match: ''), - ] + # Fix pdftohtml markup PDFTOHTML = [ # Remove
tags @@ -275,6 +267,9 @@ class PreProcessor(object): # Remove hyphenation (re.compile(r'-\n\r?'), lambda match: ''), + # Remove gray background + (re.compile(r']+>'), lambda match : '') + ] # Fix Book Designer markup @@ -305,7 +300,7 @@ class PreProcessor(object): def preprocess(self, html): if self.is_baen(html): - rules = self.BAEN + rules = [] elif self.is_book_designer(html): rules = self.BOOK_DESIGNER elif self.is_pdftohtml(html):