From 25fd859df6d0d2acd0fb20eb5a52cfb875e058ce Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 17 Feb 2013 14:13:45 +0530
Subject: [PATCH] Book polishing: Add an option to smarten punctuation in the
book when polishing
---
src/calibre/ebooks/conversion/preprocess.py | 41 +++++++++++----------
src/calibre/ebooks/oeb/polish/main.py | 25 ++++++++++---
src/calibre/ebooks/oeb/polish/replace.py | 24 ++++++++++++
src/calibre/gui2/actions/polish.py | 4 ++
4 files changed, 69 insertions(+), 25 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index bbbc96a7a5..7e5873edd2 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -62,6 +62,26 @@ def wrap_lines(match):
else:
return ital+' '
+def smarten_punctuation(html, log):
+    '''
+    Convert plain ASCII punctuation in html to its typographic equivalents.
+
+    :param html: The markup to process, as a string.
+    :param log: Logger passed on to the HeuristicProcessor.
+    :return: html after smartyPants, the ellipsis and em-dash substitutions
+             below, and substitute_entites have been applied.
+    '''
+    from calibre.utils.smartypants import smartyPants
+    from calibre.ebooks.chardet import substitute_entites
+    from calibre.ebooks.conversion.utils import HeuristicProcessor
+    preprocessor = HeuristicProcessor(log=log)
+    from uuid import uuid4
+    # Hide the HTML comment delimiters behind unique placeholders while the
+    # text is processed (presumably so the '--' they contain is not rewritten
+    # as a dash), then put them back afterwards. Replacing the empty string
+    # here instead would insert the marker between every character.
+    start = 'calibre-smartypants-'+str(uuid4())
+    stop = 'calibre-smartypants-'+str(uuid4())
+    html = html.replace('<!--', start)
+    html = html.replace('-->', stop)
+    html = preprocessor.fix_nbsp_indents(html)
+    html = smartyPants(html)
+    html = html.replace(start, '<!--')
+    html = html.replace(stop, '-->')
+    # convert ellipsis to entities to prevent wrapping
+    html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
+    # convert double dashes to em-dash
+    html = re.sub(r'\s--\s', u'\u2014', html)
+    return substitute_entites(html)
+
class DocAnalysis(object):
'''
Provides various text analysis functions to determine how the document is structured.
@@ -638,7 +658,7 @@ class HTMLPreProcessor(object):
html = preprocessor(html)
if getattr(self.extra_opts, 'smarten_punctuation', False):
- html = self.smarten_punctuation(html)
+ html = smarten_punctuation(html, self.log)
try:
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
@@ -653,23 +673,4 @@ class HTMLPreProcessor(object):
return html
- def smarten_punctuation(self, html):
- from calibre.utils.smartypants import smartyPants
- from calibre.ebooks.chardet import substitute_entites
- from calibre.ebooks.conversion.utils import HeuristicProcessor
- preprocessor = HeuristicProcessor(self.extra_opts, self.log)
- from uuid import uuid4
- start = 'calibre-smartypants-'+str(uuid4())
- stop = 'calibre-smartypants-'+str(uuid4())
- html = html.replace('', stop)
- html = preprocessor.fix_nbsp_indents(html)
- html = smartyPants(html)
- html = html.replace(start, '')
- # convert ellipsis to entities to prevent wrapping
- html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html)
- # convert double dashes to em-dash
- html = re.sub(r'\s--\s', u'\u2014', html)
- return substitute_entites(html)
diff --git a/src/calibre/ebooks/oeb/polish/main.py b/src/calibre/ebooks/oeb/polish/main.py
index 0ac8f47ce5..7a2439c4ec 100644
--- a/src/calibre/ebooks/oeb/polish/main.py
+++ b/src/calibre/ebooks/oeb/polish/main.py
@@ -15,6 +15,7 @@ from calibre.ebooks.oeb.polish.container import get_container
from calibre.ebooks.oeb.polish.stats import StatsCollector
from calibre.ebooks.oeb.polish.subset import subset_all_fonts
from calibre.ebooks.oeb.polish.cover import set_cover
+from calibre.ebooks.oeb.polish.replace import smarten_punctuation
from calibre.ebooks.oeb.polish.jacket import (
replace_jacket, add_or_replace_jacket, find_existing_jacket, remove_jacket)
from calibre.utils.logging import Log
@@ -25,6 +26,7 @@ ALL_OPTS = {
'cover': None,
'jacket': False,
'remove_jacket':False,
+ 'smarten_punctuation':False,
}
SUPPORTED = {'EPUB', 'AZW3'}
@@ -72,6 +74,13 @@ etc.
'''),
'remove_jacket': _('''\
Remove a previous inserted book jacket page.
'''),
+
+'smarten_punctuation': _('''\
+Convert plain text, dashes, ellipsis, multiple hyphens, etc. into their
+typographically correct equivalents.
+Note that the algorithm can sometimes generate incorrect results, especially
+when single quotes at the start of contractions are involved.
+'''),
}
def hfix(name, raw):
@@ -121,11 +130,6 @@ def polish(file_map, opts, log, report):
report(_('Updated metadata jacket'))
report(_('Metadata updated\n'))
- if opts.subset:
- rt(_('Subsetting embedded fonts'))
- subset_all_fonts(ebook, stats.font_stats, report)
- report('')
-
if opts.cover:
rt(_('Setting cover'))
set_cover(ebook, opts.cover, report)
@@ -150,6 +154,16 @@ def polish(file_map, opts, log, report):
report(_('No metadata jacket found'))
report('')
+ if opts.smarten_punctuation:
+ rt(_('Smartening punctuation'))
+ smarten_punctuation(ebook, report)
+ report('')
+
+ if opts.subset:
+ rt(_('Subsetting embedded fonts'))
+ subset_all_fonts(ebook, stats.font_stats, report)
+ report('')
+
ebook.commit(outbook)
report('-'*70)
report(_('Polishing took: %.1f seconds')%(time.time()-st))
@@ -190,6 +204,7 @@ def option_parser():
'Path to an OPF file. The metadata in the book is updated from the OPF file.'))
o('--jacket', '-j', help=CLI_HELP['jacket'])
o('--remove-jacket', help=CLI_HELP['remove_jacket'])
+ o('--smarten-punctuation', '-p', help=CLI_HELP['smarten_punctuation'])
o('--verbose', help=_('Produce more verbose output, useful for debugging.'))
diff --git a/src/calibre/ebooks/oeb/polish/replace.py b/src/calibre/ebooks/oeb/polish/replace.py
index 0e2f672d42..b26589d5bf 100644
--- a/src/calibre/ebooks/oeb/polish/replace.py
+++ b/src/calibre/ebooks/oeb/polish/replace.py
@@ -7,10 +7,12 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal '
__docformat__ = 'restructuredtext en'
+import codecs
from urlparse import urlparse
from cssutils import replaceUrls
+from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.ebooks.oeb.polish.container import guess_type
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links)
@@ -58,4 +60,26 @@ def replace_links(container, link_map, frag_map=lambda name, frag:frag):
if repl.replaced:
container.dirty(name)
+def smarten_punctuation(container, report):
+    '''
+    Smarten the punctuation of every spine item in container.
+
+    Each spine item is decoded, passed through the conversion pipeline's
+    smarten_punctuation and, only if the text changed, rewritten in place as
+    UTF-8 with a BOM and marked dirty.
+
+    :param container: The book container whose spine items are processed.
+    :param report: Callable that receives one message per modified file.
+    '''
+    from calibre.ebooks.conversion.preprocess import (
+        smarten_punctuation as smarten)
+    for spine_path in container.spine_items:
+        name = container.abspath_to_name(spine_path)
+        with container.open(name, 'r+b') as stream:
+            original = container.decode(stream.read())
+            smartened = smarten(original, container.log)
+            changed = smartened != original
+            if changed:
+                report(_('Smartened punctuation in: %s')%name)
+                cleaned = strip_encoding_declarations(smartened)
+                # Overwrite the file from the start with the new text
+                stream.seek(0)
+                stream.truncate()
+                stream.write(codecs.BOM_UTF8 + cleaned.encode('utf-8'))
+        if not changed:
+            continue
+        # Remove the <meta http-equiv> elements from the parsed tree; an
+        # encoding declaration will be added automatically when the file
+        # is serialized
+        root = container.parsed(name)
+        for meta in root.xpath('descendant::*[local-name()="meta" and @http-equiv]'):
+            meta.getparent().remove(meta)
+        container.dirty(name)
diff --git a/src/calibre/gui2/actions/polish.py b/src/calibre/gui2/actions/polish.py
index 4e2e093f15..aeb3a2e332 100644
--- a/src/calibre/gui2/actions/polish.py
+++ b/src/calibre/gui2/actions/polish.py
@@ -40,6 +40,9 @@ class Polish(QDialog): # {{{
'subset':_('Subsetting fonts
%s')%HELP['subset'],
+ 'smarten_punctuation':
+ _('Smarten punctuation
%s')%HELP['smarten_punctuation'],
+
'metadata':_('Updating metadata
'
'This will update all metadata and covers in the'
' ebook files to match the current metadata in the'
@@ -61,6 +64,7 @@ class Polish(QDialog): # {{{
count = 0
self.all_actions = OrderedDict([
('subset', _('Subset all embedded fonts')),
+ ('smarten_punctuation', _('Smarten punctuation')),
('metadata', _('Update metadata in book files')),
('jacket', _('Add metadata as a "book jacket" page')),
('remove_jacket', _('Remove a previously inserted book jacket')),