From d75e17e6b44e8ae688ade08bd30ae552ab0c48c3 Mon Sep 17 00:00:00 2001 From: ldolse Date: Tue, 1 Feb 2011 18:07:37 +0800 Subject: [PATCH] added scene break replacement logic --- src/calibre/ebooks/conversion/cli.py | 2 +- src/calibre/ebooks/conversion/plumber.py | 4 +++ src/calibre/ebooks/conversion/utils.py | 33 ++++++++++++++++++++---- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 33ae61f16a..278d599378 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -143,7 +143,7 @@ def add_pipeline_options(parser, plumber): ' patterns. Disabled by default. Use %s to enable. ' ' Individual actions can be disabled with the %s options.') % ('--enable-heuristics', '--disable-*'), - ['enable_heuristics'] + HEURISTIC_OPTIONS + ['enable_heuristics', 'replace_scene_breaks'] + HEURISTIC_OPTIONS ), 'SEARCH AND REPLACE' : ( diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 5807ba5f8f..59d7a0ed2a 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -530,6 +530,10 @@ OptionRecommendation(name='format_scene_breaks', help=_('Left aligned scene break markers are center aligned. ' 'Replace soft scene breaks that use multiple blank lines with' 'horizontal rules.')), + +OptionRecommendation(name='replace_scene_breaks', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Replace scene breaks with the specified text.')), OptionRecommendation(name='dehyphenate', recommended_value=True, level=OptionRecommendation.LOW, diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 1263372ce3..cf305f1022 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -33,6 +33,7 @@ class HeuristicProcessor(object): self.line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" self.line_close = "()?\s*()?\s*()?\s*" self.single_blank = re.compile(r'(\s*]*>\s*

)', re.IGNORECASE) + self.scene_break_open = '

' def is_pdftohtml(self, src): return '' in src[:1000] @@ -481,6 +482,22 @@ class HeuristicProcessor(object): html = self.blankreg.sub('\n

', html) return html + def markup_user_break(self, replacement_break): + hr_open = '
' + if re.findall('(<|>)', replacement_break): + if re.match('^
' + elif re.match('^' + else: + replacement_break = html2text(replacement_break) + replacement_break = re.sub('\s', ' ', replacement_break) + scene_break = self.scene_break_open+replacement_break+'

' + else: + replacement_break = re.sub('\s', ' ', replacement_break) + scene_break = self.scene_break_open+replacement_break+'

' + + return scene_break def __call__(self, html): @@ -498,7 +515,7 @@ class HeuristicProcessor(object): # Arrange line feeds and

tags so the line_length and no_markup functions work correctly html = self.arrange_htm_line_endings(html) - self.dump(html, 'after_arrange_line_endings') + #self.dump(html, 'after_arrange_line_endings') if self.cleanup_required(): ###### Check Markup ###### # @@ -534,7 +551,7 @@ class HeuristicProcessor(object): if getattr(self.extra_opts, 'markup_chapter_headings', False): html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs) - self.dump(html, 'after_chapter_markup') + #self.dump(html, 'after_chapter_markup') if getattr(self.extra_opts, 'italicize_common_cases', False): html = self.markup_italicis(html) @@ -608,9 +625,15 @@ class HeuristicProcessor(object): # Center separator lines, use a bit larger margin in this case scene_break_regex = self.line_open+'(?![\w\'\"])(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE) - print "found "+str(len(scene_break.findall(html)))+" scene breaks" - html = scene_break.sub('

' + '\g' + '

', html) - #html = re.sub(']*>\s*

', '

', html) + replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None) + if replacement_break is not None: + replacement_break = self.markup_user_break(replacement_break) + if len(scene_break.findall(html)) >= 1: + html = scene_break.sub(replacement_break, html) + else: + html = re.sub(']*>\s*

', replacement_break, html) + else: + html = scene_break.sub(self.scene_break_open+'\g'+'

', html) if self.deleted_nbsps: # put back non-breaking spaces in empty paragraphs so they render correctly