mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Heuristics: Improved Scene break detection and add option to control what scene breaks are replaced by.
This commit is contained in:
commit
1fe6530b8f
@ -46,7 +46,8 @@ HEURISTIC_OPTIONS = ['markup_chapter_headings',
|
||||
'italicize_common_cases', 'fix_indents',
|
||||
'html_unwrap_factor', 'unwrap_lines',
|
||||
'delete_blank_paragraphs', 'format_scene_breaks',
|
||||
'dehyphenate', 'renumber_headings']
|
||||
'dehyphenate', 'renumber_headings',
|
||||
'replace_scene_breaks']
|
||||
|
||||
def print_help(parser, log):
|
||||
help = parser.format_help().encode(preferred_encoding, 'replace')
|
||||
|
@ -531,6 +531,11 @@ OptionRecommendation(name='format_scene_breaks',
|
||||
'Replace soft scene breaks that use multiple blank lines with'
|
||||
'horizontal rules.')),
|
||||
|
||||
OptionRecommendation(name='replace_scene_breaks',
|
||||
recommended_value=None, level=OptionRecommendation.LOW,
|
||||
help=_('Replace scene breaks with the specified text. By default, the '
|
||||
'text from the input document is used.')),
|
||||
|
||||
OptionRecommendation(name='dehyphenate',
|
||||
recommended_value=True, level=OptionRecommendation.LOW,
|
||||
help=_('Analyze hyphenated words throughout the document. The '
|
||||
|
@ -26,9 +26,14 @@ class HeuristicProcessor(object):
|
||||
self.blanks_deleted = False
|
||||
self.blanks_between_paragraphs = False
|
||||
self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
||||
self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|spacer)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
||||
self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
||||
self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
||||
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}(?!\s*<h\d)', re.IGNORECASE)
|
||||
self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
|
||||
self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
|
||||
self.line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
|
||||
self.single_blank = re.compile(r'(\s*<p[^>]*>\s*</p>)', re.IGNORECASE)
|
||||
self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
|
||||
|
||||
def is_pdftohtml(self, src):
    '''
    Report whether ``src`` carries the calibre pdftohtml marker comment.

    Only the first 1000 characters of ``src`` are inspected, so a marker
    that starts later (or is cut by that boundary) is not detected.
    '''
    head = src[:1000]
    if '<!-- created by calibre\'s pdftohtml -->' in head:
        return True
    return False
|
||||
@ -187,19 +192,17 @@ class HeuristicProcessor(object):
|
||||
|
||||
# Build the Regular Expressions in pieces
|
||||
init_lookahead = "(?=<(p|div))"
|
||||
chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
|
||||
chapter_line_open = self.line_open
|
||||
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
|
||||
chapter_header_open = r"(?P<chap>"
|
||||
title_header_open = r"(?P<title>"
|
||||
chapter_header_close = ")\s*"
|
||||
title_header_close = ")"
|
||||
chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
|
||||
chapter_line_close = self.line_close
|
||||
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
|
||||
|
||||
is_pdftohtml = self.is_pdftohtml(html)
|
||||
if is_pdftohtml:
|
||||
chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
|
||||
chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
|
||||
title_line_open = "<(?P<outer2>p)[^>]*>\s*"
|
||||
title_line_close = "\s*</(?P=outer2)>"
|
||||
|
||||
@ -374,13 +377,15 @@ class HeuristicProcessor(object):
|
||||
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
||||
# Delete microsoft 'smart' tags
|
||||
html = re.sub('(?i)</?st1:\w+>', '', html)
|
||||
# Delete self closing paragraph tags
|
||||
html = re.sub('<p\s?/>', '', html)
|
||||
# Re-open self closing paragraph tags
|
||||
html = re.sub('<p[^>/]*/>', '<p> </p>', html)
|
||||
# Get rid of empty span, bold, font, em, & italics tags
|
||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||
html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
|
||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||
html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
|
||||
# Empty heading tags
|
||||
html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
|
||||
self.deleted_nbsps = True
|
||||
return html
|
||||
|
||||
@ -419,32 +424,98 @@ class HeuristicProcessor(object):
|
||||
return True
|
||||
return False
|
||||
|
||||
def detect_blank_formatting(self, html):
|
||||
blanks_before_headings = re.compile(r'(\s*<p[^>]*>\s*</p>){1,}(?=\s*<h\d)', re.IGNORECASE)
|
||||
blanks_after_headings = re.compile(r'(?<=</h\d>)(\s*<p[^>]*>\s*</p>){1,}', re.IGNORECASE)
|
||||
|
||||
def markup_spacers(match):
|
||||
blanks = match.group(0)
|
||||
blanks = self.blankreg.sub('\n<p class="spacer"> </p>', blanks)
|
||||
return blanks
|
||||
html = blanks_before_headings.sub(markup_spacers, html)
|
||||
html = blanks_after_headings.sub(markup_spacers, html)
|
||||
def merge_blanks(self, html, blanks_count=None):
    '''
    Collapse each run of two or more consecutive blank paragraphs into a
    single blank paragraph whose top margin grows with the length of the run.

    A run of ``n`` blank paragraphs becomes one paragraph with
    ``margin-top`` of ``0.5 + 1.5 * (n - 1)`` em. The paragraph's class is
    ``whitespace`` or ``softbreak`` (depending on whether the run contained
    paragraphs already marked as ``whitespace``) followed by ten times the
    margin, e.g. ``softbreak20``.

    :param html: markup to process
    :param blanks_count: unused; accepted for compatibility with callers
    :return: the markup with runs of blank paragraphs merged
    '''
    base_em = .5  # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
    em_per_line = 1.5  # Add another 1.5 em for each additional blank

    def merge_matches(match):
        to_merge = match.group(0)
        lines = float(len(self.single_blank.findall(to_merge))) - 1.
        em = base_em + (em_per_line * lines)
        # str.find() returns -1 (which is truthy) when the text is absent,
        # so test membership explicitly to pick the right class.
        if 'whitespace' in to_merge:
            css_class = 'whitespace'
        else:
            css_class = 'softbreak'
        # The whole match is a single run of blanks, so it is replaced
        # outright by the one merged paragraph.
        return ('\n<p class="' + css_class + str(int(em * 10)) +
                '" style="text-align:center; margin-top:' + str(em) + 'em"> </p>')

    return self.any_multi_blank.sub(merge_matches, html)
|
||||
|
||||
def detect_whitespace(self, html):
    '''
    Turn blank paragraphs that are not scene breaks into explicit styling.

    Three passes are made over ``html``:

    1. Blank paragraphs directly before and/or after a heading are removed
       and converted to ``margin-top``/``margin-bottom`` on the heading,
       1em per blank paragraph. Note that the heading's opening tag is
       rewritten, so any attributes it had are replaced by the new style.
    2. Blank paragraphs next to a short, single-line paragraph whose text
       ends in a word character (i.e. no closing punctuation) are tagged
       with the ``whitespace`` class.
    3. If more sections than ``self.min_chapters`` were detected, blank
       paragraphs in everything preceding the first heading are tagged
       with the ``whitespace`` class as well.

    :param html: markup to process
    :return: the processed markup
    '''
    blanks_around_headings = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?(?P<heading><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)
    blanks_n_nopunct = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)

    def count_blanks(blanks):
        # Number of blank paragraphs contained in a run of blanks
        return len(self.single_blank.findall(blanks))

    def merge_header_whitespace(match):
        initblanks = match.group('initparas')
        # The trailing blanks live in their own group; they must be read
        # and counted separately from the leading ones.
        endblanks = match.group('endparas')
        heading = match.group('heading')

        if initblanks is None and endblanks is None:
            return heading

        top_margin = ''
        bottom_margin = ''
        if initblanks is not None:
            top_margin = 'margin-top:' + str(count_blanks(initblanks)) + 'em;'
        if endblanks is not None:
            bottom_margin = 'margin-bottom:' + str(count_blanks(endblanks)) + 'em;'

        return re.sub(r'(?i)<h(?P<hnum>\d+)[^>]*>',
                '\n\n<h' + r'\g<hnum>' + ' style="' + top_margin + bottom_margin + '">',
                heading)

    html = blanks_around_headings.sub(merge_header_whitespace, html)

    def markup_whitespaces(match):
        blanks = match.group(0)
        blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks)
        return blanks

    html = blanks_n_nopunct.sub(markup_whitespaces, html)

    if self.html_preprocess_sections > self.min_chapters:
        # Everything ahead of the first heading is front matter, so blank
        # paragraphs found there are treated as plain whitespace.
        html = re.sub(r'(?si)^.*?(?=<h\d)', markup_whitespaces, html)

    return html
|
||||
|
||||
def detect_soft_breaks(self, html):
|
||||
if not self.blanks_deleted and self.blanks_between_paragraphs:
|
||||
html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
|
||||
html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>', html)
|
||||
else:
|
||||
html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
|
||||
html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
|
||||
return html
|
||||
|
||||
def markup_user_break(self, replacement_break):
    '''
    Wrap the scene break text supplied by the user in markup that is
    centered and has appropriate margins.

    * Text without any ``<`` or ``>`` is placed in a scene break paragraph
      with every whitespace character converted to a plain space.
    * ``<hr>`` tags are kept when they specify a numeric width (either as
      ``width:N`` in a style or as a ``width=N`` attribute, with optional
      spaces and quotes). The rule is wrapped in a div whose left and right
      margins are ``(100 - width) / 2`` percent, because many ebook devices
      don't support margin:auto. Widths above 100 are treated as 100.
      An ``<hr>`` without a usable width is replaced by the default rule.
    * ``<img>`` tags are placed unchanged in a scene break paragraph.
    * All other html is converted to text.

    :param replacement_break: the replacement string entered by the user
    :return: markup to substitute for each scene break
    '''
    hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em">'

    if not re.search('[<>]', replacement_break):
        # Plain text, no markup involved
        text = re.sub(r'\s', ' ', replacement_break)
        return self.scene_break_open + text + '</p>'

    if re.match('<hr', replacement_break):
        # Tolerate the spellings users actually type, e.g. 'width: 50%'
        # or width="50", rather than failing on int() of the whole tag.
        width_match = re.search(r'width\s*[:=]\s*["\']?\s*(?P<wnum>\d+)', replacement_break)
        if width_match is None:
            return hr_open + '<hr style="height: 3px; background:#505050" /></div>'
        # Clamp so that the computed margins can never become negative
        width = min(int(width_match.group('wnum')), 100)
        divpercent = (100 - width) // 2
        hr_open = hr_open.replace('45', str(divpercent))
        return hr_open + replacement_break + '</div>'

    if re.match('<img', replacement_break):
        return self.scene_break_open + replacement_break + '</p>'

    # Any other markup is flattened to text
    from calibre.utils.html2text import html2text
    text = html2text(replacement_break)
    text = re.sub(r'\s', ' ', text)
    return self.scene_break_open + text + '</p>'
|
||||
|
||||
|
||||
def __call__(self, html):
|
||||
self.log.debug("********* Heuristic processing HTML *********")
|
||||
|
||||
# Count the words in the document to estimate how many chapters to look for and whether
|
||||
# other types of processing are attempted
|
||||
try:
|
||||
@ -458,7 +529,7 @@ class HeuristicProcessor(object):
|
||||
|
||||
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
|
||||
html = self.arrange_htm_line_endings(html)
|
||||
|
||||
#self.dump(html, 'after_arrange_line_endings')
|
||||
if self.cleanup_required():
|
||||
###### Check Markup ######
|
||||
#
|
||||
@ -478,6 +549,11 @@ class HeuristicProcessor(object):
|
||||
# fix indents must run before this step, as it removes non-breaking spaces
|
||||
html = self.cleanup_markup(html)
|
||||
|
||||
is_pdftohtml = self.is_pdftohtml(html)
|
||||
if is_pdftohtml:
|
||||
self.line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
|
||||
self.line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
|
||||
|
||||
# ADE doesn't render <br />, change to empty paragraphs
|
||||
#html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
|
||||
|
||||
@ -489,6 +565,7 @@ class HeuristicProcessor(object):
|
||||
|
||||
if getattr(self.extra_opts, 'markup_chapter_headings', False):
|
||||
html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
|
||||
#self.dump(html, 'after_chapter_markup')
|
||||
|
||||
if getattr(self.extra_opts, 'italicize_common_cases', False):
|
||||
html = self.markup_italicis(html)
|
||||
@ -498,7 +575,7 @@ class HeuristicProcessor(object):
|
||||
if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
|
||||
self.log.debug("deleting blank lines")
|
||||
self.blanks_deleted = True
|
||||
html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
|
||||
html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
|
||||
html = self.blankreg.sub('', html)
|
||||
|
||||
# Determine line ending type
|
||||
@ -539,7 +616,7 @@ class HeuristicProcessor(object):
|
||||
if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
|
||||
self.log.debug("Looking for more split points based on punctuation,"
|
||||
" currently have " + unicode(self.html_preprocess_sections))
|
||||
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
||||
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
||||
html = chapdetect3.sub(self.chapter_break, html)
|
||||
|
||||
if getattr(self.extra_opts, 'renumber_headings', False):
|
||||
@ -549,14 +626,32 @@ class HeuristicProcessor(object):
|
||||
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
||||
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
||||
|
||||
# If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
|
||||
# and style them with the 'whitespace' class. All remaining blank lines are styled as softbreaks.
|
||||
# Multiple sequential blank paragraphs are merged with appropriate margins
|
||||
# If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
|
||||
if getattr(self.extra_opts, 'format_scene_breaks', False):
|
||||
html = self.detect_blank_formatting(html)
|
||||
html = self.detect_whitespace(html)
|
||||
html = self.detect_soft_breaks(html)
|
||||
# Center separator lines
|
||||
html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
|
||||
#html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
|
||||
blanks_count = len(self.any_multi_blank.findall(html))
|
||||
if blanks_count >= 1:
|
||||
html = self.merge_blanks(html, blanks_count)
|
||||
scene_break_regex = self.line_open+'(?![\w\'\"])(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
|
||||
scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
|
||||
# If the user has enabled scene break replacement, then either softbreaks
|
||||
# or 'hard' scene breaks are replaced, depending on which is in use
|
||||
# Otherwise separator lines are centered, use a bit larger margin in this case
|
||||
replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
|
||||
if replacement_break is not None:
|
||||
replacement_break = self.markup_user_break(replacement_break)
|
||||
if len(scene_break.findall(html)) >= 1:
|
||||
html = scene_break.sub(replacement_break, html)
|
||||
else:
|
||||
html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
|
||||
else:
|
||||
html = scene_break.sub(self.scene_break_open+'\g<break>'+'</p>', html)
|
||||
|
||||
if self.deleted_nbsps:
|
||||
# put back non-breaking spaces in empty paragraphs to preserve original formatting
|
||||
# put back non-breaking spaces in empty paragraphs so they render correctly
|
||||
html = self.anyblank.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
|
||||
return html
|
||||
|
@ -6,6 +6,7 @@ __docformat__ = 'restructuredtext en'
|
||||
|
||||
from PyQt4.Qt import Qt
|
||||
|
||||
from calibre.gui2 import gprefs
|
||||
from calibre.gui2.convert.heuristics_ui import Ui_Form
|
||||
from calibre.gui2.convert import Widget
|
||||
|
||||
@ -21,17 +22,35 @@ class HeuristicsWidget(Widget, Ui_Form):
|
||||
['enable_heuristics', 'markup_chapter_headings',
|
||||
'italicize_common_cases', 'fix_indents',
|
||||
'html_unwrap_factor', 'unwrap_lines',
|
||||
'delete_blank_paragraphs', 'format_scene_breaks',
|
||||
'delete_blank_paragraphs',
|
||||
'format_scene_breaks', 'replace_scene_breaks',
|
||||
'dehyphenate', 'renumber_headings']
|
||||
)
|
||||
self.db, self.book_id = db, book_id
|
||||
self.rssb_defaults = ['', '<hr />', '* * *']
|
||||
self.initialize_options(get_option, get_help, db, book_id)
|
||||
|
||||
self.load_histories()
|
||||
|
||||
self.opt_enable_heuristics.stateChanged.connect(self.enable_heuristics)
|
||||
self.opt_unwrap_lines.stateChanged.connect(self.enable_unwrap)
|
||||
|
||||
self.enable_heuristics(self.opt_enable_heuristics.checkState())
|
||||
|
||||
def restore_defaults(self, get_option):
    '''
    Restore the default option values and move the built-in scene break
    replacements back to the front of the saved history.

    :param get_option: passed through to ``Widget.restore_defaults``
    '''
    Widget.restore_defaults(self, get_option)

    # Build a fresh list rather than editing the stored one in place, and
    # read the history with .get() so a missing entry is treated as empty.
    saved = gprefs.get('replace_scene_breaks_history', [])
    remaining = [x for x in saved if x not in self.rssb_defaults]
    gprefs['replace_scene_breaks_history'] = self.rssb_defaults + remaining
|
||||
|
||||
def commit_options(self, save_defaults=False):
    '''
    Store the scene break replacement history, then commit the options
    through ``Widget.commit_options`` and return its result.
    '''
    self.save_histories()
    committed = Widget.commit_options(self, save_defaults)
    return committed
|
||||
|
||||
def break_cycles(self):
|
||||
Widget.break_cycles(self)
|
||||
|
||||
@ -45,6 +64,30 @@ class HeuristicsWidget(Widget, Ui_Form):
|
||||
if val is None and g is self.opt_html_unwrap_factor:
|
||||
g.setValue(0.0)
|
||||
return True
|
||||
if not val and g is self.opt_replace_scene_breaks:
|
||||
g.lineEdit().setText('')
|
||||
return True
|
||||
|
||||
def load_histories(self):
    '''
    Fill the scene break replacement combo box from the saved history.

    The value currently shown in the combo box is kept as the first entry;
    the saved history (or the built-in defaults when nothing has been
    saved) follows, skipping entries the combo box already contains.
    '''
    combo = self.opt_replace_scene_breaks
    val = unicode(combo.currentText())
    # Copy the list: when no history is stored, .get() hands back
    # self.rssb_defaults itself, which must not be modified here.
    rssb_hist = list(gprefs.get('replace_scene_breaks_history', self.rssb_defaults))
    if val in rssb_hist:
        rssb_hist.remove(val)
    rssb_hist.insert(0, val)
    for v in rssb_hist:
        # Ensure we don't have duplicate items.
        if combo.findText(v) == -1:
            combo.addItem(v)
    combo.setCurrentIndex(0)
|
||||
|
||||
def save_histories(self):
    '''
    Save the scene break replacement history.

    The text in the combo box's line edit comes first, followed by the
    combo box items. Only the first ten of these are considered, and
    duplicates among them are dropped before the list is stored.
    '''
    combo = self.opt_replace_scene_breaks
    candidates = [unicode(combo.lineEdit().text())]
    candidates.extend(unicode(combo.itemText(i)) for i in xrange(combo.count()))

    unique = []
    for entry in candidates[:10]:
        # Ensure we don't have duplicate items.
        if entry in unique:
            continue
        unique.append(entry)
    gprefs['replace_scene_breaks_history'] = unique
|
||||
|
||||
def enable_heuristics(self, state):
|
||||
state = state == Qt.Checked
|
||||
|
@ -150,6 +150,45 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item>
|
||||
<layout class="QHBoxLayout" name="horizontalLayout_2">
|
||||
<property name="sizeConstraint">
|
||||
<enum>QLayout::SetDefaultConstraint</enum>
|
||||
</property>
|
||||
<item>
|
||||
<widget class="QLabel" name="label_2">
|
||||
<property name="sizePolicy">
|
||||
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||
<horstretch>0</horstretch>
|
||||
<verstretch>0</verstretch>
|
||||
</sizepolicy>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>Replace soft scene &breaks:</string>
|
||||
</property>
|
||||
<property name="buddy">
|
||||
<cstring>opt_replace_scene_breaks</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item>
|
||||
<widget class="QComboBox" name="opt_replace_scene_breaks">
|
||||
<property name="sizePolicy">
|
||||
<sizepolicy hsizetype="Expanding" vsizetype="Fixed">
|
||||
<horstretch>0</horstretch>
|
||||
<verstretch>0</verstretch>
|
||||
</sizepolicy>
|
||||
</property>
|
||||
<property name="editable">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
<property name="insertPolicy">
|
||||
<enum>QComboBox::InsertAtTop</enum>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</item>
|
||||
<item>
|
||||
<widget class="QCheckBox" name="opt_dehyphenate">
|
||||
<property name="text">
|
||||
|
@ -311,10 +311,15 @@ remove all non-breaking-space entities, or may include false positive matches re
|
||||
|
||||
:guilabel:`Ensure scene breaks are consistently formatted`
|
||||
With this option |app| will attempt to detect common scene-break markers and ensure that they are center aligned.
|
||||
It also attempts to detect scene breaks defined by white space and replace them with a horizontal rule 15% of the
|
||||
page width. Some readers may find this desirable as these 'soft' scene breaks often become page breaks on readers, and
|
||||
thus become difficult to distinguish.
|
||||
'Soft' scene break markers, i.e. scene breaks only defined by extra white space, are styled to ensure that they
|
||||
will not be displayed in conjunction with page breaks.
|
||||
|
||||
:guilabel:`Replace scene breaks`
|
||||
If this option is configured then |app| will replace scene break markers it finds with the replacement text specified by the
|
||||
user. In general you should avoid using HTML tags; |app| will discard any tags and use pre-defined markup. <hr />
|
||||
tags, i.e. horizontal rules, are an exception. These can optionally be specified with styles; if you choose to add your own
|
||||
style be sure to include the 'width' setting, otherwise the style information will be discarded.
|
||||
|
||||
:guilabel:`Remove unnecessary hyphens`
|
||||
|app| will analyze all hyphenated content in the document when this option is enabled. The document itself is used
|
||||
as a dictionary for analysis. This allows |app| to accurately remove hyphens for any words in the document in any language,
|
||||
@ -628,7 +633,7 @@ between 0 and 1. The default is 0.45, just under the median line length. Lower t
|
||||
text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under :guilabel:`PDF Input`.
|
||||
|
||||
Also, they often have headers and footers as part of the document that will become included with the text.
|
||||
Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not
|
||||
Use the Search and Replace panel to remove headers and footers to mitigate this issue. If the headers and footers are not
|
||||
removed from the text it can throw off the paragraph unwrapping. To learn how to use the header and footer removal options, read
|
||||
:ref:`regexptutorial`.
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user