From 25fd859df6d0d2acd0fb20eb5a52cfb875e058ce Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Feb 2013 14:13:45 +0530 Subject: [PATCH] Book polishing: Add an option to smarten punctuation in the book when polishing --- src/calibre/ebooks/conversion/preprocess.py | 41 +++++++++++---------- src/calibre/ebooks/oeb/polish/main.py | 25 ++++++++++--- src/calibre/ebooks/oeb/polish/replace.py | 24 ++++++++++++ src/calibre/gui2/actions/polish.py | 4 ++ 4 files changed, 69 insertions(+), 25 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index bbbc96a7a5..7e5873edd2 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -62,6 +62,26 @@ def wrap_lines(match): else: return ital+' ' +def smarten_punctuation(html, log): + from calibre.utils.smartypants import smartyPants + from calibre.ebooks.chardet import substitute_entites + from calibre.ebooks.conversion.utils import HeuristicProcessor + preprocessor = HeuristicProcessor(log=log) + from uuid import uuid4 + start = 'calibre-smartypants-'+str(uuid4()) + stop = 'calibre-smartypants-'+str(uuid4()) + html = html.replace('<!--', start) + html = html.replace('-->', stop) + html = preprocessor.fix_nbsp_indents(html) + html = smartyPants(html) + html = html.replace(start, '<!--') + html = html.replace(stop, '-->') + # convert ellipsis to entities to prevent wrapping + html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html) + # convert double dashes to em-dash + html = re.sub(r'\s--\s', u'\u2014', html) + return substitute_entites(html) + class DocAnalysis(object): ''' Provides various text analysis functions to determine how the document is structured.
@@ -638,7 +658,7 @@ class HTMLPreProcessor(object): html = preprocessor(html) if getattr(self.extra_opts, 'smarten_punctuation', False): - html = self.smarten_punctuation(html) + html = smarten_punctuation(html, self.log) try: unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars @@ -653,23 +673,4 @@ class HTMLPreProcessor(object): return html - def smarten_punctuation(self, html): - from calibre.utils.smartypants import smartyPants - from calibre.ebooks.chardet import substitute_entites - from calibre.ebooks.conversion.utils import HeuristicProcessor - preprocessor = HeuristicProcessor(self.extra_opts, self.log) - from uuid import uuid4 - start = 'calibre-smartypants-'+str(uuid4()) - stop = 'calibre-smartypants-'+str(uuid4()) - html = html.replace('<!--', start) - html = html.replace('-->', stop) - html = preprocessor.fix_nbsp_indents(html) - html = smartyPants(html) - html = html.replace(start, '<!--') - html = html.replace(stop, '-->') - # convert ellipsis to entities to prevent wrapping - html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html) - # convert double dashes to em-dash - html = re.sub(r'\s--\s', u'\u2014', html) - return substitute_entites(html) diff --git a/src/calibre/ebooks/oeb/polish/main.py b/src/calibre/ebooks/oeb/polish/main.py index 0ac8f47ce5..7a2439c4ec 100644 --- a/src/calibre/ebooks/oeb/polish/main.py +++ b/src/calibre/ebooks/oeb/polish/main.py @@ -15,6 +15,7 @@ from calibre.ebooks.oeb.polish.container import get_container from calibre.ebooks.oeb.polish.stats import StatsCollector from calibre.ebooks.oeb.polish.subset import subset_all_fonts from calibre.ebooks.oeb.polish.cover import set_cover +from calibre.ebooks.oeb.polish.replace import smarten_punctuation from calibre.ebooks.oeb.polish.jacket import ( replace_jacket, add_or_replace_jacket, find_existing_jacket, remove_jacket) from calibre.utils.logging import Log @@ -25,6 +26,7 @@ ALL_OPTS = { 'cover': None, 'jacket': False, 'remove_jacket':False, + 'smarten_punctuation':False, } SUPPORTED = {'EPUB', 'AZW3'} @@ -72,6 +74,13 @@ etc.

'''), 'remove_jacket': _('''\

Remove a previous inserted book jacket page.

'''), + +'smarten_punctuation': _('''\ +

Convert plain text, dashes, ellipsis, multiple hyphens, etc. into their +typographically correct equivalents.

+

Note that the algorithm can sometimes generate incorrect results, especially +when single quotes at the start of contractions are involved.

+'''), } def hfix(name, raw): @@ -121,11 +130,6 @@ def polish(file_map, opts, log, report): report(_('Updated metadata jacket')) report(_('Metadata updated\n')) - if opts.subset: - rt(_('Subsetting embedded fonts')) - subset_all_fonts(ebook, stats.font_stats, report) - report('') - if opts.cover: rt(_('Setting cover')) set_cover(ebook, opts.cover, report) @@ -150,6 +154,16 @@ def polish(file_map, opts, log, report): report(_('No metadata jacket found')) report('') + if opts.smarten_punctuation: + rt(_('Smartening punctuation')) + smarten_punctuation(ebook, report) + report('') + + if opts.subset: + rt(_('Subsetting embedded fonts')) + subset_all_fonts(ebook, stats.font_stats, report) + report('') + ebook.commit(outbook) report('-'*70) report(_('Polishing took: %.1f seconds')%(time.time()-st)) @@ -190,6 +204,7 @@ def option_parser(): 'Path to an OPF file. The metadata in the book is updated from the OPF file.')) o('--jacket', '-j', help=CLI_HELP['jacket']) o('--remove-jacket', help=CLI_HELP['remove_jacket']) + o('--smarten-punctuation', '-p', help=CLI_HELP['smarten_punctuation']) o('--verbose', help=_('Produce more verbose output, useful for debugging.')) diff --git a/src/calibre/ebooks/oeb/polish/replace.py b/src/calibre/ebooks/oeb/polish/replace.py index 0e2f672d42..b26589d5bf 100644 --- a/src/calibre/ebooks/oeb/polish/replace.py +++ b/src/calibre/ebooks/oeb/polish/replace.py @@ -7,10 +7,12 @@ __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import codecs from urlparse import urlparse from cssutils import replaceUrls +from calibre.ebooks.chardet import strip_encoding_declarations from calibre.ebooks.oeb.polish.container import guess_type from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links) @@ -58,4 +60,26 @@ def replace_links(container, link_map, frag_map=lambda name, frag:frag): if repl.replaced: container.dirty(name) +def smarten_punctuation(container, report): + from 
calibre.ebooks.conversion.preprocess import smarten_punctuation + for path in container.spine_items: + name = container.abspath_to_name(path) + changed = False + with container.open(name, 'r+b') as f: + html = container.decode(f.read()) + newhtml = smarten_punctuation(html, container.log) + if newhtml != html: + changed = True + report(_('Smartened punctuation in: %s')%name) + newhtml = strip_encoding_declarations(newhtml) + f.seek(0) + f.truncate() + f.write(codecs.BOM_UTF8 + newhtml.encode('utf-8')) + if changed: + # Add an encoding declaration (it will be added automatically when + # serialized) + root = container.parsed(name) + for m in root.xpath('descendant::*[local-name()="meta" and @http-equiv]'): + m.getparent().remove(m) + container.dirty(name) diff --git a/src/calibre/gui2/actions/polish.py b/src/calibre/gui2/actions/polish.py index 4e2e093f15..aeb3a2e332 100644 --- a/src/calibre/gui2/actions/polish.py +++ b/src/calibre/gui2/actions/polish.py @@ -40,6 +40,9 @@ class Polish(QDialog): # {{{ 'subset':_('

Subsetting fonts

%s')%HELP['subset'], + 'smarten_punctuation': + _('

Smarten punctuation

%s')%HELP['smarten_punctuation'], + 'metadata':_('

Updating metadata

' '

This will update all metadata and covers in the' ' ebook files to match the current metadata in the' @@ -61,6 +64,7 @@ class Polish(QDialog): # {{{ count = 0 self.all_actions = OrderedDict([ ('subset', _('Subset all embedded fonts')), + ('smarten_punctuation', _('Smarten punctuation')), ('metadata', _('Update metadata in book files')), ('jacket', _('Add metadata as a "book jacket" page')), ('remove_jacket', _('Remove a previously inserted book jacket')),