diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index bbbc96a7a5..7e5873edd2 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -62,6 +62,26 @@ def wrap_lines(match): else: return ital+' ' +def smarten_punctuation(html, log): + from calibre.utils.smartypants import smartyPants + from calibre.ebooks.chardet import substitute_entites + from calibre.ebooks.conversion.utils import HeuristicProcessor + preprocessor = HeuristicProcessor(log=log) + from uuid import uuid4 + start = 'calibre-smartypants-'+str(uuid4()) + stop = 'calibre-smartypants-'+str(uuid4()) + html = html.replace('', stop) + html = preprocessor.fix_nbsp_indents(html) + html = smartyPants(html) + html = html.replace(start, '') + # convert ellipsis to entities to prevent wrapping + html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html) + # convert double dashes to em-dash + html = re.sub(r'\s--\s', u'\u2014', html) + return substitute_entites(html) + class DocAnalysis(object): ''' Provides various text analysis functions to determine how the document is structured. @@ -638,7 +658,7 @@ class HTMLPreProcessor(object): html = preprocessor(html) if getattr(self.extra_opts, 'smarten_punctuation', False): - html = self.smarten_punctuation(html) + html = smarten_punctuation(html, self.log) try: unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars @@ -653,23 +673,4 @@ class HTMLPreProcessor(object): return html - def smarten_punctuation(self, html): - from calibre.utils.smartypants import smartyPants - from calibre.ebooks.chardet import substitute_entites - from calibre.ebooks.conversion.utils import HeuristicProcessor - preprocessor = HeuristicProcessor(self.extra_opts, self.log) - from uuid import uuid4 - start = 'calibre-smartypants-'+str(uuid4()) - stop = 'calibre-smartypants-'+str(uuid4()) - html = html.replace('', stop) - html = preprocessor.fix_nbsp_indents(html) - html = smartyPants(html) - html = html.replace(start, '') - # convert ellipsis to entities to prevent wrapping - html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html) - # convert double dashes to em-dash - html = re.sub(r'\s--\s', u'\u2014', html) - return substitute_entites(html) diff --git a/src/calibre/ebooks/oeb/polish/main.py b/src/calibre/ebooks/oeb/polish/main.py index 0ac8f47ce5..7a2439c4ec 100644 --- a/src/calibre/ebooks/oeb/polish/main.py +++ b/src/calibre/ebooks/oeb/polish/main.py @@ -15,6 +15,7 @@ from calibre.ebooks.oeb.polish.container import get_container from calibre.ebooks.oeb.polish.stats import StatsCollector from calibre.ebooks.oeb.polish.subset import subset_all_fonts from calibre.ebooks.oeb.polish.cover import set_cover +from calibre.ebooks.oeb.polish.replace import smarten_punctuation from calibre.ebooks.oeb.polish.jacket import ( replace_jacket, add_or_replace_jacket, find_existing_jacket, remove_jacket) from calibre.utils.logging import Log @@ -25,6 +26,7 @@ ALL_OPTS = { 'cover': None, 'jacket': False, 'remove_jacket':False, + 'smarten_punctuation':False, } SUPPORTED = {'EPUB', 'AZW3'} @@ -72,6 +74,13 @@ etc.
'''), 'remove_jacket': _('''\Remove a previous inserted book jacket page.
'''), + +'smarten_punctuation': _('''\ +Convert plain text, dashes, ellipsis, multiple hyphens, etc. into their +typographically correct equivalents.
+Note that the algorithm can sometimes generate incorrect results, especially +when single quotes at the start of contractions are involved.
+'''), } def hfix(name, raw): @@ -121,11 +130,6 @@ def polish(file_map, opts, log, report): report(_('Updated metadata jacket')) report(_('Metadata updated\n')) - if opts.subset: - rt(_('Subsetting embedded fonts')) - subset_all_fonts(ebook, stats.font_stats, report) - report('') - if opts.cover: rt(_('Setting cover')) set_cover(ebook, opts.cover, report) @@ -150,6 +154,16 @@ def polish(file_map, opts, log, report): report(_('No metadata jacket found')) report('') + if opts.smarten_punctuation: + rt(_('Smartening punctuation')) + smarten_punctuation(ebook, report) + report('') + + if opts.subset: + rt(_('Subsetting embedded fonts')) + subset_all_fonts(ebook, stats.font_stats, report) + report('') + ebook.commit(outbook) report('-'*70) report(_('Polishing took: %.1f seconds')%(time.time()-st)) @@ -190,6 +204,7 @@ def option_parser(): 'Path to an OPF file. The metadata in the book is updated from the OPF file.')) o('--jacket', '-j', help=CLI_HELP['jacket']) o('--remove-jacket', help=CLI_HELP['remove_jacket']) + o('--smarten-punctuation', '-p', help=CLI_HELP['smarten_punctuation']) o('--verbose', help=_('Produce more verbose output, useful for debugging.')) diff --git a/src/calibre/ebooks/oeb/polish/replace.py b/src/calibre/ebooks/oeb/polish/replace.py index 0e2f672d42..b26589d5bf 100644 --- a/src/calibre/ebooks/oeb/polish/replace.py +++ b/src/calibre/ebooks/oeb/polish/replace.py @@ -7,10 +7,12 @@ __license__ = 'GPL v3' __copyright__ = '2013, Kovid GoyalThis will update all metadata and covers in the' ' ebook files to match the current metadata in the' @@ -61,6 +64,7 @@ class Polish(QDialog): # {{{ count = 0 self.all_actions = OrderedDict([ ('subset', _('Subset all embedded fonts')), + ('smarten_punctuation', _('Smarten punctuation')), ('metadata', _('Update metadata in book files')), ('jacket', _('Add metadata as a "book jacket" page')), ('remove_jacket', _('Remove a previously inserted book jacket')),