diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index fb55ee74fb..0421534f65 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -31,6 +31,12 @@ def chap_head(match): else: return '

'+chap+'
'+title+'


' +def wrap_lines(match): + ital = match.group('ital') + if not ital: + return ' ' + else: + return ital+' ' def line_length(raw, percent): ''' @@ -93,17 +99,11 @@ class HTMLPreProcessor(object): (re.compile(r'', re.IGNORECASE), lambda match: ''), # Remove
tags (re.compile(r'', re.IGNORECASE), lambda match: '
'), - # Remove page numbers - (re.compile(r'\d+
', re.IGNORECASE), lambda match: ''), # Replace

with

(re.compile(r'\s*', re.IGNORECASE), lambda match: '

'), - # Remove
- (re.compile(r'(.*)', re.IGNORECASE), - lambda match: match.group() if \ - re.match('<', match.group(1).lstrip()) or \ - len(match.group(1)) < 40 else match.group(1)), + # Remove hyphenation - (re.compile(r'-\n\r?'), lambda match: ''), + (re.compile(r'-\n\r?'), lambda match: ''), # Remove gray background (re.compile(r']+>'), lambda match : ''), @@ -112,15 +112,12 @@ class HTMLPreProcessor(object): (re.compile(ur'\u00a0'), lambda match : ' '), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(]*>)?(]*>)?s*(?P(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(]*>|]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(]*>|]*>))((?P.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), - (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), + (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<i><b>|<i>|<b>)?(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?(</i></b>|</i>|</b>)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>(<i>)?\s*\w+(\s+\w+)?(</i>)?)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), + (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), - # Un wrap lines - (re.compile(r'(?<=[^\.^\^?^!^"^”])\s*(</(i|b|u)>)*\s*<p.*?>\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '), - # Clean up spaces (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Add space before and after italics @@ -162,12 +159,12 @@ class HTMLPreProcessor(object): elif self.is_book_designer(html): rules = self.BOOK_DESIGNER elif self.is_pdftohtml(html): - # Add rules that require matching line length here - #line_length_rules = [ - # (re.compile('%i' % line_length(html, .85)), lambda match:) - #] + line_length_rules = [ + # Un wrap using punctuation + (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .2), re.UNICODE), wrap_lines), + ] - rules = self.PDFTOHTML # + line_length_rules + rules = self.PDFTOHTML + line_length_rules else: rules = [] for rule in self.PREPROCESS + rules: