Fix #1065 (EPUB Conversion Error)

This commit is contained in:
Kovid Goyal 2008-09-26 09:25:30 -07:00
parent 0b8168258a
commit ca806a09c3
2 changed files with 7 additions and 12 deletions

View File

@@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
Split the flows in an epub file to conform to size limitations.
'''
import os, math, copy, logging, functools
import os, math, copy, logging, functools, collections
from lxml.etree import XPath as _XPath
from lxml import etree, html
@@ -234,7 +234,7 @@ class Splitter(LoggingInterface):
all anchors in the original tree. Internal links are re-directed. The
original file is deleted and the split files are saved.
'''
self.anchor_map = {None:self.base%0}
self.anchor_map = collections.defaultdict(lambda :self.base%0)
self.files = []
for i, tree in enumerate(self.trees):

View File

@@ -252,15 +252,7 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
class PreProcessor(object):
PREPROCESS = []
# Fix Baen markup
BAEN = [
(re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE),
lambda match: match.group(1)),
(re.compile(r'<p>\s*(<a id.*?>\s*</a>)\s*</p>', re.IGNORECASE),
lambda match: match.group(1)),
(re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*</a>', re.IGNORECASE),
lambda match: ''),
]
# Fix pdftohtml markup
PDFTOHTML = [
# Remove <hr> tags
@@ -275,6 +267,9 @@ class PreProcessor(object):
# Remove hyphenation
(re.compile(r'-\n\r?'), lambda match: ''),
# Remove gray background
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>')
]
# Fix Book Designer markup
@@ -305,7 +300,7 @@ class PreProcessor(object):
def preprocess(self, html):
if self.is_baen(html):
rules = self.BAEN
rules = []
elif self.is_book_designer(html):
rules = self.BOOK_DESIGNER
elif self.is_pdftohtml(html):