From 5596f506a7a511eea83f3dad86e93ac87fb9f757 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Tue, 1 Feb 2011 01:51:22 +0800
Subject: [PATCH] improved scene break/whitespace formatting
---
src/calibre/ebooks/conversion/utils.py | 60 +++++++++++++++++++-------
1 file changed, 44 insertions(+), 16 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 74afbe7a42..77086efd97 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -30,6 +30,9 @@ class HeuristicProcessor(object):
self.anyblank = re.compile(r'\s*(?P]*>)\s*(?P
)', re.IGNORECASE)
self.multi_blank = re.compile(r'(\s*]*>\s*
){2,}(?!\s*]*>\s*
){2,}', re.IGNORECASE)
+ self.line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*"
+ self.line_close = "((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>"
+ self.single_blank = re.compile(r'(\s*]*>\s*
)', re.IGNORECASE)
def is_pdftohtml(self, src):
return '' in src[:1000]
@@ -188,19 +191,17 @@ class HeuristicProcessor(object):
# Build the Regular Expressions in pieces
init_lookahead = "(?=<(p|div))"
- chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*"
+ chapter_line_open = self.line_open
title_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*"
chapter_header_open = r"(?P"
title_header_open = r"(?P"
chapter_header_close = ")\s*"
title_header_close = ")"
- chapter_line_close = "((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>"
+ chapter_line_close = self.line_close
title_line_close = "((?P=inner6)>)?\s*((?P=inner5)>)?\s*((?P=inner4)>)?\s*(?P=outer2)>"
is_pdftohtml = self.is_pdftohtml(html)
if is_pdftohtml:
- chapter_line_open = "<(?Pp)[^>]*>(\s*<[ibu][^>]*>)?\s*"
- chapter_line_close = "\s*([ibu][^>]*>\s*)?(?P=outer)>"
title_line_open = "<(?Pp)[^>]*>\s*"
title_line_close = "\s*(?P=outer2)>"
@@ -382,6 +383,8 @@ class HeuristicProcessor(object):
html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*(font|[ibu]|em|strong)>\s*){0,2}\s*(font|[ibu]|em|strong)>", " ", html)
html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html)
html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*(font|[ibu]|em|strong)>\s*){0,2}\s*(font|[ibu]|em|strong)>", " ", html)
+ # Empty heading tags
+ html = re.sub(r'(?i)\s*', '', html)
self.deleted_nbsps = True
return html
@@ -421,13 +424,12 @@ class HeuristicProcessor(object):
return False
def merge_blanks(self, html, blanks_count=None):
- single_blank = re.compile(r'(\s*]*>\s*
)', re.IGNORECASE)
base_em = .5 # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
em_per_line = 1.5 # Add another 1.5 em for each additional blank
def merge_matches(match):
to_merge = match.group(0)
- lines = float(len(single_blank.findall(to_merge))) - 1.
+ lines = float(len(self.single_blank.findall(to_merge))) - 1.
em = base_em + (em_per_line * lines)
if to_merge.find('whitespace'):
newline = self.any_multi_blank.sub('\n
', match.group(0))
@@ -439,17 +441,37 @@ class HeuristicProcessor(object):
return html
def detect_whitespace(self, html):
- blanks_before_headings = re.compile(r'(\s*]*>\s*
){1,}(?=\s*)(\s*]*>\s*
){1,}', re.IGNORECASE)
+ blanks_around_headings = re.compile(r'(?P(]*>\s*
\s*){1,}\s*)?(?P\d+)[^>]*>.*?)(?P\s*(]*>\s*
\s*){1,})?', re.IGNORECASE)
+ blanks_n_nopunct = re.compile(r'(?P(]*>\s*
\s*){1,}\s*)?]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W]((span|[ibu]|em|strong|font)>\s*)*
(?P\s*(]*>\s*
\s*){1,})?', re.IGNORECASE)
+        def merge_header_whitespace(match):  # replace blank paragraphs around a heading with CSS margins on the heading
+            initblanks = match.group('initparas')
+            endblanks = match.group('endparas')  # NOTE(review): trailing-blanks group name is not visible in this view -- confirm against blanks_around_headings
+            heading = match.group('heading')
+            top_margin = ''
+            bottom_margin = ''
+            if initblanks is not None:
+                top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
+            if endblanks is not None:
+                bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;'
+            # One em of margin per blank paragraph matched; the heading is returned untouched when there are none
+            if initblanks is None and endblanks is None:
+                return heading
+            else:
+                heading = re.sub(r'(?i)<h(?P<hnum>\d+)[^>]*>', '<h'+r'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', heading)
+            return heading
+
+ html = blanks_around_headings.sub(merge_header_whitespace, html)
+
def markup_whitespaces(match):
- blanks = match.group(0)
- blanks = self.blankreg.sub('\n
', blanks)
- return blanks
- html = blanks_before_headings.sub(markup_whitespaces, html)
- html = blanks_after_headings.sub(markup_whitespaces, html)
+ blanks = match.group(0)
+ blanks = self.blankreg.sub('\n
', blanks)
+ return blanks
+
+ html = blanks_n_nopunct.sub(markup_whitespaces, html)
if self.html_preprocess_sections > self.min_chapters:
html = re.sub('(?si)^.*?(?=, change to empty paragraphs
#html = re.sub('
]*>', u'\u00a0
', html)
@@ -558,7 +585,7 @@ class HeuristicProcessor(object):
if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
self.log.debug("Looking for more split points based on punctuation,"
" currently have " + unicode(self.html_preprocess_sections))
- chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?([ibu]>){0,2}\s*()?\s*([ibu]>){0,2}\s*()?\s*(p|div)>)', re.IGNORECASE)
+ chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?([ibu]>){0,2}\s*()?\s*([ibu]>){0,2}\s*()?\s*(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html)
if getattr(self.extra_opts, 'renumber_headings', False):
@@ -579,9 +606,10 @@ class HeuristicProcessor(object):
if blanks_count >= 1:
html = self.merge_blanks(html, blanks_count)
# Center separator lines, use a bit larger margin in this case
- scene_break = re.compile(r'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?![\w\'\"])(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>', re.IGNORECASE|re.UNICODE)
+ scene_break_regex = self.line_open+'(?![\w\'\"])(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
+ scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
print "found "+str(len(scene_break.findall(html)))+" scene breaks"
- html = scene_break.sub('' + '\g' + '
', html)
+ html = scene_break.sub('' + '\g' + '
', html)
#html = re.sub(']*>\s*
', '
', html)
if self.deleted_nbsps: