From cdf61487ffaa8d6caf0037680bd4d7c4901ab066 Mon Sep 17 00:00:00 2001 From: ldolse Date: Tue, 25 Jan 2011 17:29:55 +0800 Subject: [PATCH 1/4] add strong tags to list of empty tags removed during cleanup --- src/calibre/ebooks/conversion/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index f6e259b6f9..d4b03b8d55 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -375,9 +375,9 @@ class HeuristicProcessor(object): html = re.sub('', '', html) # Get rid of empty span, bold, font, em, & italics tags html = re.sub(r"\s*]*>\s*(]*>\s*){0,2}\s*\s*", " ", html) - html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*\s*){0,2}\s*", " ", html) + html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*\s*){0,2}\s*", " ", html) html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html) - html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*\s*){0,2}\s*", " ", html) + html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*\s*){0,2}\s*", " ", html) self.deleted_nbsps = True return html From 5c988788a05aec3284b29268dad1a4748cd89948 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 29 Jan 2011 19:06:02 +0800 Subject: [PATCH 2/4] started scene break detection in heuristics, improved text conversion of Sigil compatible chapter titles --- src/calibre/ebooks/conversion/utils.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 683eaac6d2..786bb79bae 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -27,7 +27,7 @@ class HeuristicProcessor(object): self.linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.softbreak = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) - self.multi_blank = re.compile(r'(\s*]*>\s*

){2,}', re.IGNORECASE) + self.multi_blank = re.compile(r'(\s*]*>\s*

){2,}(?!\s*' in src[:1000] @@ -42,8 +42,10 @@ class HeuristicProcessor(object): " chapters. - " + unicode(chap)) return '

'+chap+'

\n' else: - txt_chap = html2text(chap) - txt_title = html2text(title) + delete_whitespace = re.compile('^\s*(?P.*?)\s*$') + delete_quotes = re.compile('\'\"') + txt_chap = delete_quotes.sub('', delete_whitespace.sub('\g', html2text(chap))) + txt_title = delete_quotes.sub('', delete_whitespace.sub('\g', html2text(title))) self.html_preprocess_sections = self.html_preprocess_sections + 1 self.log.debug("marked " + unicode(self.html_preprocess_sections) + " chapters & titles. - " + unicode(chap) + ", " + unicode(title)) @@ -416,6 +418,12 @@ class HeuristicProcessor(object): return True return False + def detect_blank_formatting(self, html): + blanks_before_headings = re.compile(r'(\s*]*>\s*

){2,}(?=\s*'+'\n'+'', html) if getattr(self.extra_opts, 'format_scene_breaks', False): + html = self.detect_blank_formatting(html) + html = self.detect_soft_breaks(html) # Center separator lines html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) if not self.blanks_deleted: From bace283325092cb5757d6554fced9db27dfbbaca Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 29 Jan 2011 21:15:44 +0800 Subject: [PATCH 3/4] initial scene break detection --- src/calibre/ebooks/conversion/utils.py | 40 +++++++++++++++++--------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 786bb79bae..957950ec29 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -24,9 +24,10 @@ class HeuristicProcessor(object): self.chapters_no_title = 0 self.chapters_with_title = 0 self.blanks_deleted = False + self.blanks_between_paragraphs = False self.linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) - self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) - self.softbreak = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) + self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) + self.anyblank = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.multi_blank = re.compile(r'(\s*]*>\s*

){2,}(?!\s*]*>\s*

){2,}(?=\s*]*>\s*

){1,}(?=\s*)(\s*]*>\s*

){1,}', re.IGNORECASE) + + def markup_spacers(match): + blanks = match.group(0) + blanks = self.blankreg.sub('\n

', blanks) + return blanks + html = blanks_before_headings.sub(markup_spacers, html) + html = blanks_after_headings.sub(markup_spacers, html) + if self.html_preprocess_sections > self.min_chapters: + html = re.sub('(?si)^.*?(?=

', html) + else: + html = self.blankreg.sub('\n

', html) return html + + def __call__(self, html): self.log.debug("********* Heuristic processing HTML *********") @@ -465,25 +482,23 @@ class HeuristicProcessor(object): #html = re.sub(']*>', u'

\u00a0

', html) # Determine whether the document uses interleaved blank lines - blanks_between_paragraphs = self.analyze_blanks(html) + self.blanks_between_paragraphs = self.analyze_blanks(html) #self.dump(html, 'before_chapter_markup') # detect chapters/sections to match xpath or splitting logic if getattr(self.extra_opts, 'markup_chapter_headings', False): - html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs) - - self.dump(html, 'after_chapter_markup') + html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs) if getattr(self.extra_opts, 'italicize_common_cases', False): html = self.markup_italicis(html) # If more than 40% of the lines are empty paragraphs and the user has enabled delete # blank paragraphs then delete blank lines to clean up spacing - if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False): + if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False): self.log.debug("deleting blank lines") self.blanks_deleted = True - html = self.multi_blank.sub('\n

', html) + html = self.multi_blank.sub('\n

', html) html = self.blankreg.sub('', html) # Determine line ending type @@ -538,13 +553,10 @@ class HeuristicProcessor(object): html = self.detect_blank_formatting(html) html = self.detect_soft_breaks(html) # Center separator lines - html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) - if not self.blanks_deleted: - html = self.multi_blank.sub('\n

', html) + html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) html = re.sub(']*>\s*

', '

', html) if self.deleted_nbsps: # put back non-breaking spaces in empty paragraphs to preserve original formatting - html = self.blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) - html = self.softbreak.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) + html = self.anyblank.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) return html From c98a122539751fc4fd56d4f32ee7bd4e06313f8b Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 29 Jan 2011 21:53:53 +0800 Subject: [PATCH 4/4] ... --- src/calibre/ebooks/conversion/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 957950ec29..5beefb5bd9 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -554,7 +554,7 @@ class HeuristicProcessor(object): html = self.detect_soft_breaks(html) # Center separator lines html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) - html = re.sub(']*>\s*

', '

', html) + #html = re.sub(']*>\s*

', '

', html) if self.deleted_nbsps: # put back non-breaking spaces in empty paragraphs to preserve original formatting