From 0ad1f3c088f2ff0872de49171fd99a91a50a031a Mon Sep 17 00:00:00 2001
From: ldolse
Date: Wed, 25 Aug 2010 10:49:42 +1000
Subject: [PATCH] preprocessing regex tweaks
---
src/calibre/ebooks/conversion/preprocess.py | 2 +-
src/calibre/ebooks/rtf/input.py | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index da652c1a38..940c27344b 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -209,7 +209,7 @@ class HTMLPreProcessor(object):
(re.compile(ur'\u00a0'), lambda match : ' '),
# Detect Chapters to match default XPATH in GUI
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?((i|b)>((i|b)>)?)?)?(br|p)[^>]*>\s*(?P(<(i|b)>)?\s*\w+(\s*\w+)?\s*((i|b)>)?\s*(?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
+ (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>((i|b)>)?)?)?(br|p)[^>]*>\s*(?P(<(i|b)>)?\s*\w+(\s*\w+)?\s*((i|b)>)?\s*(?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P([A-Z \'"!]{5,})\s*(\d+|\w+)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?'), chap_head),
# Have paragraphs show better
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index dcffbe68ca..eaba28e429 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -231,12 +231,12 @@ class RTFInput(InputFormatPlugin):
if self.options.preprocess_html:
print "********* Preprocessing HTML *********\n"
# Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r']*>\s*]*>\s*(?P(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(i|b)>|(i|b)>)?)\s*\s*
', re.IGNORECASE)
+ chapdetect = re.compile(r']*>\s*]*>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(<(/i|b)>)?)?)\s*\s*
', re.IGNORECASE)
res = chapdetect.sub(''+'\g'+'
\n', res)
# Unwrap lines using punctation if the median length of all lines is less than 150
length = line_length('html', res, 0.4)
print "*** Median length is " + str(length) + " ***\n"
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*\s*(
)?\s*(?P]*>\s*(]*>\s*\s*)
\s*){0,3}\s*]*>\s*(]*>)?\s*" % length, re.UNICODE)
+ unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*\s*
\s*(?P]*>\s*(]*>\s*\s*)
\s*){0,3}\s*]*>\s*]*>\s*" % length, re.UNICODE)
if length < 150:
res = unwrap.sub(' ', res)
f.write(res)