Book polishing: Add an option to smarten punctuation in the book when polishing

This commit is contained in:
Kovid Goyal 2013-02-17 14:13:45 +05:30
parent bb27c4ebda
commit 25fd859df6
4 changed files with 69 additions and 25 deletions

View File

@ -62,6 +62,26 @@ def wrap_lines(match):
else:
return ital+' '
def smarten_punctuation(html, log):
    '''
    Convert plain punctuation in ``html`` to its typographic form.

    The markup is run through ``HeuristicProcessor.fix_nbsp_indents`` and
    ``smartyPants``; afterwards runs of three dots become ``&hellip;`` and
    whitespace-delimited double dashes become an em-dash.

    :param html: The markup to process.
    :param log: Log object handed to ``HeuristicProcessor``.
    :return: The result of ``substitute_entites`` applied to the processed
             markup.
    '''
    from uuid import uuid4
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor

    heuristics = HeuristicProcessor(log=log)

    # Comment delimiters are swapped for unique placeholders while the
    # markup passes through fix_nbsp_indents and smartyPants, then restored.
    # NOTE(review): presumably so the dashes in the delimiters are not
    # rewritten -- confirm against smartyPants.
    comment_open = 'calibre-smartypants-'+str(uuid4())
    comment_close = 'calibre-smartypants-'+str(uuid4())
    html = html.replace('<!--', comment_open).replace('-->', comment_close)

    html = smartyPants(heuristics.fix_nbsp_indents(html))

    html = html.replace(comment_open, '<!--').replace(comment_close, '-->')

    # Ellipsis following a word character -> entity, to prevent wrapping
    html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
    # Double dash surrounded by whitespace -> em-dash
    html = re.sub(r'\s--\s', u'\u2014', html)

    return substitute_entites(html)
class DocAnalysis(object):
'''
Provides various text analysis functions to determine how the document is structured.
@ -638,7 +658,7 @@ class HTMLPreProcessor(object):
html = preprocessor(html)
if getattr(self.extra_opts, 'smarten_punctuation', False):
html = self.smarten_punctuation(html)
html = smarten_punctuation(html, self.log)
try:
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
@ -653,23 +673,4 @@ class HTMLPreProcessor(object):
return html
def smarten_punctuation(self, html):
    '''
    Convert plain punctuation in ``html`` to its typographic form.

    Uses ``self.extra_opts`` and ``self.log`` to build the
    ``HeuristicProcessor`` whose ``fix_nbsp_indents`` is applied before
    ``smartyPants``. Afterwards runs of three dots become ``&hellip;`` and
    whitespace-delimited double dashes become an em-dash.

    :param html: The markup to process.
    :return: The result of ``substitute_entites`` applied to the processed
             markup.
    '''
    from uuid import uuid4
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor

    heuristics = HeuristicProcessor(self.extra_opts, self.log)

    # Comment delimiters are swapped for unique placeholders while the
    # markup passes through fix_nbsp_indents and smartyPants, then restored.
    comment_open = 'calibre-smartypants-'+str(uuid4())
    comment_close = 'calibre-smartypants-'+str(uuid4())
    html = html.replace('<!--', comment_open).replace('-->', comment_close)

    html = smartyPants(heuristics.fix_nbsp_indents(html))

    html = html.replace(comment_open, '<!--').replace(comment_close, '-->')

    # Ellipsis following a word character -> entity, to prevent wrapping
    html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
    # Double dash surrounded by whitespace -> em-dash
    html = re.sub(r'\s--\s', u'\u2014', html)

    return substitute_entites(html)

View File

@ -15,6 +15,7 @@ from calibre.ebooks.oeb.polish.container import get_container
from calibre.ebooks.oeb.polish.stats import StatsCollector
from calibre.ebooks.oeb.polish.subset import subset_all_fonts
from calibre.ebooks.oeb.polish.cover import set_cover
from calibre.ebooks.oeb.polish.replace import smarten_punctuation
from calibre.ebooks.oeb.polish.jacket import (
replace_jacket, add_or_replace_jacket, find_existing_jacket, remove_jacket)
from calibre.utils.logging import Log
@ -25,6 +26,7 @@ ALL_OPTS = {
'cover': None,
'jacket': False,
'remove_jacket':False,
'smarten_punctuation':False,
}
SUPPORTED = {'EPUB', 'AZW3'}
@ -72,6 +74,13 @@ etc.</p>'''),
'remove_jacket': _('''\
<p>Remove a previous inserted book jacket page.</p>
'''),
'smarten_punctuation': _('''\
<p>Convert plain text, dashes, ellipsis, multiple hyphens, etc. into their
typographically correct equivalents.</p>
<p>Note that the algorithm can sometimes generate incorrect results, especially
when single quotes at the start of contractions are involved.</p>
'''),
}
def hfix(name, raw):
@ -121,11 +130,6 @@ def polish(file_map, opts, log, report):
report(_('Updated metadata jacket'))
report(_('Metadata updated\n'))
if opts.subset:
rt(_('Subsetting embedded fonts'))
subset_all_fonts(ebook, stats.font_stats, report)
report('')
if opts.cover:
rt(_('Setting cover'))
set_cover(ebook, opts.cover, report)
@ -150,6 +154,16 @@ def polish(file_map, opts, log, report):
report(_('No metadata jacket found'))
report('')
if opts.smarten_punctuation:
rt(_('Smartening punctuation'))
smarten_punctuation(ebook, report)
report('')
if opts.subset:
rt(_('Subsetting embedded fonts'))
subset_all_fonts(ebook, stats.font_stats, report)
report('')
ebook.commit(outbook)
report('-'*70)
report(_('Polishing took: %.1f seconds')%(time.time()-st))
@ -190,6 +204,7 @@ def option_parser():
'Path to an OPF file. The metadata in the book is updated from the OPF file.'))
o('--jacket', '-j', help=CLI_HELP['jacket'])
o('--remove-jacket', help=CLI_HELP['remove_jacket'])
o('--smarten-punctuation', '-p', help=CLI_HELP['smarten_punctuation'])
o('--verbose', help=_('Produce more verbose output, useful for debugging.'))

View File

@ -7,10 +7,12 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import codecs
from urlparse import urlparse
from cssutils import replaceUrls
from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.ebooks.oeb.polish.container import guess_type
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links)
@ -58,4 +60,26 @@ def replace_links(container, link_map, frag_map=lambda name, frag:frag):
if repl.replaced:
container.dirty(name)
def smarten_punctuation(container, report):
    '''
    Smarten the punctuation of every spine item in ``container``.

    Each item is decoded, run through the conversion pipeline's
    ``smarten_punctuation`` and, only if the text changed, rewritten in
    place as UTF-8 with a BOM and marked dirty.

    :param container: Book container providing ``spine_items``,
                      ``abspath_to_name``, ``open``, ``decode``, ``parsed``,
                      ``dirty`` and ``log``.
    :param report: Callable that receives one message per modified file.
    '''
    # Aliased so the imported text-level function is not confused with this
    # container-level function of the same name.
    from calibre.ebooks.conversion.preprocess import (
        smarten_punctuation as smarten_text)

    for path in container.spine_items:
        name = container.abspath_to_name(path)
        with container.open(name, 'r+b') as f:
            original = container.decode(f.read())
            smartened = smarten_text(original, container.log)
            changed = smartened != original
            if changed:
                report(_('Smartened punctuation in: %s')%name)
                smartened = strip_encoding_declarations(smartened)
                # Overwrite the file contents from the start
                f.seek(0)
                f.truncate()
                f.write(codecs.BOM_UTF8 + smartened.encode('utf-8'))
        if not changed:
            continue
        # Remove the existing http-equiv meta tags; an encoding declaration
        # will be added automatically when the file is serialized.
        root = container.parsed(name)
        for meta in root.xpath('descendant::*[local-name()="meta" and @http-equiv]'):
            meta.getparent().remove(meta)
        container.dirty(name)

View File

@ -40,6 +40,9 @@ class Polish(QDialog): # {{{
'subset':_('<h3>Subsetting fonts</h3>%s')%HELP['subset'],
'smarten_punctuation':
_('<h3>Smarten punctuation</h3>%s')%HELP['smarten_punctuation'],
'metadata':_('<h3>Updating metadata</h3>'
'<p>This will update all metadata and covers in the'
' ebook files to match the current metadata in the'
@ -61,6 +64,7 @@ class Polish(QDialog): # {{{
count = 0
self.all_actions = OrderedDict([
('subset', _('Subset all embedded fonts')),
('smarten_punctuation', _('Smarten punctuation')),
('metadata', _('Update metadata in book files')),
('jacket', _('Add metadata as a "book jacket" page')),
('remove_jacket', _('Remove a previously inserted book jacket')),