diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 632a7a3291..b105a6c042 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -23,6 +23,14 @@ def sanitize_head(match):
x = _span_pat.sub('', x)
return '
\n'+x+'\n'
+def chap_head(match):
+ chap = match.group('chap')
+ title = match.group('title')
+ if not title:
+ return '', re.IGNORECASE), lambda match: '
'),
# Remove page numbers
(re.compile(r'\d+
', re.IGNORECASE), lambda match: ''),
- # Remove <br> and replace <br><br> with <p>
+ # Replace <br><br> with <p>
(re.compile(r'\s*', re.IGNORECASE), lambda match: ''),
+ # Remove <br>
(re.compile(r'(.*)', re.IGNORECASE),
lambda match: match.group() if \
re.match('<', match.group(1).lstrip()) or \
@@ -69,15 +78,22 @@ class HTMLPreProcessor(object):
# Remove non breaking spaces
(re.compile(ur'\u00a0'), lambda match : ' '),
+ # Detect Chapters to match default XPATH in GUI
+ (re.compile(r'(
]*>)?(?p[^>]*>)?s*(?P(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?', re.IGNORECASE), chap_head),
+ (re.compile(r'(
]*>)?(?p[^>]*>)?s*(?P([A-Z \'"!]{5,})\s*(\d+|\w+)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?'), chap_head),
+
# Have paragraphs show better
(re.compile(r''), lambda match : ''),
# Un wrap lines
- (re.compile(r'(?<=\w)\s*(i|b|u)>\s*
\s*<(i|b|u)>\s*(?=\w)'), lambda match: ' '),
- (re.compile(r'(?<=\w)\s*\s*(?=\w)', re.UNICODE), lambda match: ' '),
+ (re.compile(r'(?<=[^\.^\^?^!^"^”])\s*((i|b|u)>)*\s*\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '),
+
# Clean up spaces
- (re.compile(u'(?<=\.|,|:|;|\?|!|”|"|\')[\s^ ]*(?=<)'), lambda match: ' '),
- ]
+ (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
+ # Add space before and after italics
+ (re.compile(r'(?'), lambda match: ' '),
+ (re.compile(r'(?=\w)'), lambda match: ' '),
+ ]
# Fix Book Designer markup
BOOK_DESIGNER = [
diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py
index 4d8516f6c3..a5ee619937 100644
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@@ -98,7 +98,7 @@ def get_cover(stream):
data = cStringIO.StringIO()
try:
- StreamReadWrapper(stream) as stream:
+ with StreamReadWrapper(stream) as stream:
pdf = PdfFileReader(stream)
output = PdfFileWriter()