From cdf61487ffaa8d6caf0037680bd4d7c4901ab066 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Tue, 25 Jan 2011 17:29:55 +0800
Subject: [PATCH 1/4] add strong tags to list of empty tags removed during
cleanup
---
src/calibre/ebooks/conversion/utils.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index f6e259b6f9..d4b03b8d55 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -375,9 +375,9 @@ class HeuristicProcessor(object):
html = re.sub('', '', html)
# Get rid of empty span, bold, font, em, & italics tags
html = re.sub(r"\s*]*>\s*(]*>\s*){0,2}\s*\s*", " ", html)
- html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*(font|[ibu]|em)>\s*){0,2}\s*(font|[ibu]|em)>", " ", html)
+ html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*(font|[ibu]|em|strong)>\s*){0,2}\s*(font|[ibu]|em|strong)>", " ", html)
html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html)
- html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*(font|[ibu]|em)>\s*){0,2}\s*(font|[ibu]|em)>", " ", html)
+ html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*(font|[ibu]|em|strong)>\s*){0,2}\s*(font|[ibu]|em|strong)>", " ", html)
self.deleted_nbsps = True
return html
From 5c988788a05aec3284b29268dad1a4748cd89948 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 29 Jan 2011 19:06:02 +0800
Subject: [PATCH 2/4] started scene break detection in heuristics, improved
text conversion of Sigil compatible chapter titles
---
src/calibre/ebooks/conversion/utils.py | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 683eaac6d2..786bb79bae 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -27,7 +27,7 @@ class HeuristicProcessor(object):
self.linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL)
self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P
)', re.IGNORECASE)
self.softbreak = re.compile(r'\s*(?P]*>)\s*(?P
)', re.IGNORECASE)
- self.multi_blank = re.compile(r'(\s*]*>\s*
){2,}', re.IGNORECASE)
+ self.multi_blank = re.compile(r'(\s*]*>\s*
){2,}(?!\s*' in src[:1000]
@@ -42,8 +42,10 @@ class HeuristicProcessor(object):
" chapters. - " + unicode(chap))
return ''+chap+'
\n'
else:
- txt_chap = html2text(chap)
- txt_title = html2text(title)
+ delete_whitespace = re.compile('^\s*(?P.*?)\s*$')
+ delete_quotes = re.compile('\'\"')
+ txt_chap = delete_quotes.sub('', delete_whitespace.sub('\g', html2text(chap)))
+ txt_title = delete_quotes.sub('', delete_whitespace.sub('\g', html2text(title)))
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log.debug("marked " + unicode(self.html_preprocess_sections) +
" chapters & titles. - " + unicode(chap) + ", " + unicode(title))
@@ -416,6 +418,12 @@ class HeuristicProcessor(object):
return True
return False
+ def detect_blank_formatting(self, html):
+ blanks_before_headings = re.compile(r'(\s*]*>\s*
){2,}(?=\s*'+'\n'+'
', html)
if getattr(self.extra_opts, 'format_scene_breaks', False):
+ html = self.detect_blank_formatting(html)
+ html = self.detect_soft_breaks(html)
# Center separator lines
html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>', '' + '\g' + '
', html)
if not self.blanks_deleted:
From bace283325092cb5757d6554fced9db27dfbbaca Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 29 Jan 2011 21:15:44 +0800
Subject: [PATCH 3/4] initial scene break detection
---
src/calibre/ebooks/conversion/utils.py | 40 +++++++++++++++++---------
1 file changed, 26 insertions(+), 14 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 786bb79bae..957950ec29 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -24,9 +24,10 @@ class HeuristicProcessor(object):
self.chapters_no_title = 0
self.chapters_with_title = 0
self.blanks_deleted = False
+ self.blanks_between_paragraphs = False
self.linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL)
- self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P
)', re.IGNORECASE)
- self.softbreak = re.compile(r'\s*(?P]*>)\s*(?P
)', re.IGNORECASE)
+ self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P
)', re.IGNORECASE)
+ self.anyblank = re.compile(r'\s*(?P]*>)\s*(?P
)', re.IGNORECASE)
self.multi_blank = re.compile(r'(\s*]*>\s*
){2,}(?!\s*]*>\s*
){2,}(?=\s*]*>\s*
){1,}(?=\s*)(\s*]*>\s*
){1,}', re.IGNORECASE)
+
+ def markup_spacers(match):
+ blanks = match.group(0)
+ blanks = self.blankreg.sub('\n
', blanks)
+ return blanks
+ html = blanks_before_headings.sub(markup_spacers, html)
+ html = blanks_after_headings.sub(markup_spacers, html)
+ if self.html_preprocess_sections > self.min_chapters:
+ html = re.sub('(?si)^.*?(?=
', html)
+ else:
+ html = self.blankreg.sub('\n
', html)
return html
+
+
def __call__(self, html):
self.log.debug("********* Heuristic processing HTML *********")
@@ -465,25 +482,23 @@ class HeuristicProcessor(object):
#html = re.sub('
]*>', u'\u00a0
', html)
# Determine whether the document uses interleaved blank lines
- blanks_between_paragraphs = self.analyze_blanks(html)
+ self.blanks_between_paragraphs = self.analyze_blanks(html)
#self.dump(html, 'before_chapter_markup')
# detect chapters/sections to match xpath or splitting logic
if getattr(self.extra_opts, 'markup_chapter_headings', False):
- html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
-
- self.dump(html, 'after_chapter_markup')
+ html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
if getattr(self.extra_opts, 'italicize_common_cases', False):
html = self.markup_italicis(html)
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
# blank paragraphs then delete blank lines to clean up spacing
- if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
+ if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
self.log.debug("deleting blank lines")
self.blanks_deleted = True
- html = self.multi_blank.sub('\n
', html)
+ html = self.multi_blank.sub('\n
', html)
html = self.blankreg.sub('', html)
# Determine line ending type
@@ -538,13 +553,10 @@ class HeuristicProcessor(object):
html = self.detect_blank_formatting(html)
html = self.detect_soft_breaks(html)
# Center separator lines
- html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>', '' + '\g' + '
', html)
- if not self.blanks_deleted:
- html = self.multi_blank.sub('\n
', html)
+ html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>', '' + '\g' + '
', html)
html = re.sub(']*>\s*
', '
', html)
if self.deleted_nbsps:
# put back non-breaking spaces in empty paragraphs to preserve original formatting
- html = self.blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html)
- html = self.softbreak.sub('\n'+r'\g'+u'\u00a0'+r'\g', html)
+ html = self.anyblank.sub('\n'+r'\g'+u'\u00a0'+r'\g', html)
return html
From c98a122539751fc4fd56d4f32ee7bd4e06313f8b Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 29 Jan 2011 21:53:53 +0800
Subject: [PATCH 4/4] comment out the re.sub that follows separator centering
---
src/calibre/ebooks/conversion/utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 957950ec29..5beefb5bd9 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -554,7 +554,7 @@ class HeuristicProcessor(object):
html = self.detect_soft_breaks(html)
# Center separator lines
html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>', '' + '\g' + '
', html)
- html = re.sub(']*>\s*
', '
', html)
+ #html = re.sub(']*>\s*
', '
', html)
if self.deleted_nbsps:
# put back non-breaking spaces in empty paragraphs to preserve original formatting