From d75e17e6b44e8ae688ade08bd30ae552ab0c48c3 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Tue, 1 Feb 2011 18:07:37 +0800
Subject: [PATCH] added scene break replacement logic
---
src/calibre/ebooks/conversion/cli.py | 2 +-
src/calibre/ebooks/conversion/plumber.py | 4 +++
src/calibre/ebooks/conversion/utils.py | 33 ++++++++++++++++++++----
3 files changed, 33 insertions(+), 6 deletions(-)
diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index 33ae61f16a..278d599378 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -143,7 +143,7 @@ def add_pipeline_options(parser, plumber):
' patterns. Disabled by default. Use %s to enable. '
' Individual actions can be disabled with the %s options.')
% ('--enable-heuristics', '--disable-*'),
- ['enable_heuristics'] + HEURISTIC_OPTIONS
+ ['enable_heuristics', 'replace_scene_breaks'] + HEURISTIC_OPTIONS
),
'SEARCH AND REPLACE' : (
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 5807ba5f8f..59d7a0ed2a 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -530,6 +530,10 @@ OptionRecommendation(name='format_scene_breaks',
help=_('Left aligned scene break markers are center aligned. '
'Replace soft scene breaks that use multiple blank lines with'
'horizontal rules.')),
+
+OptionRecommendation(name='replace_scene_breaks',
+ recommended_value=None, level=OptionRecommendation.LOW,
+ help=_('Replace scene breaks with the specified text.')),
OptionRecommendation(name='dehyphenate',
recommended_value=True, level=OptionRecommendation.LOW,
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 1263372ce3..cf305f1022 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -33,6 +33,7 @@ class HeuristicProcessor(object):
self.line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*"
self.line_close = "((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>"
self.single_blank = re.compile(r'(\s*]*>\s*
)', re.IGNORECASE)
+ self.scene_break_open = ''
def is_pdftohtml(self, src):
return '' in src[:1000]
@@ -481,6 +482,22 @@ class HeuristicProcessor(object):
html = self.blankreg.sub('\n
', html)
return html
+ def markup_user_break(self, replacement_break):
+ hr_open = ''
+ if re.findall('(<|>)', replacement_break):
+ if re.match('^
'
+ elif re.match('^
'
+ else:
+ replacement_break = html2text(replacement_break)
+ replacement_break = re.sub('\s', ' ', replacement_break)
+ scene_break = self.scene_break_open+replacement_break+''
+ else:
+ replacement_break = re.sub('\s', ' ', replacement_break)
+ scene_break = self.scene_break_open+replacement_break+''
+
+ return scene_break
def __call__(self, html):
@@ -498,7 +515,7 @@ class HeuristicProcessor(object):
# Arrange line feeds and tags so the line_length and no_markup functions work correctly
html = self.arrange_htm_line_endings(html)
- self.dump(html, 'after_arrange_line_endings')
+ #self.dump(html, 'after_arrange_line_endings')
if self.cleanup_required():
###### Check Markup ######
#
@@ -534,7 +551,7 @@ class HeuristicProcessor(object):
if getattr(self.extra_opts, 'markup_chapter_headings', False):
html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
- self.dump(html, 'after_chapter_markup')
+ #self.dump(html, 'after_chapter_markup')
if getattr(self.extra_opts, 'italicize_common_cases', False):
html = self.markup_italicis(html)
@@ -608,9 +625,15 @@ class HeuristicProcessor(object):
# Center separator lines, use a bit larger margin in this case
scene_break_regex = self.line_open+'(?![\w\'\"])(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
- print "found "+str(len(scene_break.findall(html)))+" scene breaks"
- html = scene_break.sub('' + '\g' + '
', html)
- #html = re.sub(']*>\s*
', '
', html)
+ replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
+ if replacement_break is not None:
+ replacement_break = self.markup_user_break(replacement_break)
+ if len(scene_break.findall(html)) >= 1:
+ html = scene_break.sub(replacement_break, html)
+ else:
+ html = re.sub(']*>\s*
', replacement_break, html)
+ else:
+ html = scene_break.sub(self.scene_break_open+'\g'+'', html)
if self.deleted_nbsps:
# put back non-breaking spaces in empty paragraphs so they render correctly