Book polishing: Add an option to smarten punctuation in the book when polishing

This commit is contained in:
Kovid Goyal 2013-02-17 14:13:45 +05:30
parent bb27c4ebda
commit 25fd859df6
4 changed files with 69 additions and 25 deletions

View File

@ -62,6 +62,26 @@ def wrap_lines(match):
else: else:
return ital+' ' return ital+' '
def smarten_punctuation(html, log):
    '''
    Return ``html`` with plain punctuation replaced by typographic forms.

    The markup is run through ``HeuristicProcessor.fix_nbsp_indents`` and
    ``smartyPants``; afterwards three dots following a word character become
    ``&hellip;`` and a whitespace-delimited ``--`` becomes an em-dash. The
    final text is passed through ``substitute_entites`` before being returned.

    :param html: The markup to process.
    :param log: Passed to ``HeuristicProcessor`` as its ``log`` argument.
    '''
    from uuid import uuid4
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor

    heuristics = HeuristicProcessor(log=log)
    # Comment delimiters are swapped for unique placeholders while the text
    # is processed (presumably so their dashes are left untouched) and are
    # restored as soon as smartyPants has run.
    placeholders = (
        ('<!--', 'calibre-smartypants-'+str(uuid4())),
        ('-->', 'calibre-smartypants-'+str(uuid4())),
    )
    for delimiter, token in placeholders:
        html = html.replace(delimiter, token)
    html = smartyPants(heuristics.fix_nbsp_indents(html))
    for delimiter, token in placeholders:
        html = html.replace(token, delimiter)
    # convert ellipsis to entities to prevent wrapping
    html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
    # convert double dashes to em-dash
    html = re.sub(r'\s--\s', u'\u2014', html)
    return substitute_entites(html)
class DocAnalysis(object): class DocAnalysis(object):
''' '''
Provides various text analysis functions to determine how the document is structured. Provides various text analysis functions to determine how the document is structured.
@ -638,7 +658,7 @@ class HTMLPreProcessor(object):
html = preprocessor(html) html = preprocessor(html)
if getattr(self.extra_opts, 'smarten_punctuation', False): if getattr(self.extra_opts, 'smarten_punctuation', False):
html = self.smarten_punctuation(html) html = smarten_punctuation(html, self.log)
try: try:
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
@ -653,23 +673,4 @@ class HTMLPreProcessor(object):
return html return html
def smarten_punctuation(self, html):
    '''
    Return ``html`` with plain punctuation replaced by typographic forms.

    Uses a ``HeuristicProcessor`` built from ``self.extra_opts`` and
    ``self.log`` to fix nbsp indents, applies ``smartyPants``, then turns
    three dots after a word character into ``&hellip;`` and a
    whitespace-delimited ``--`` into an em-dash. The result is passed through
    ``substitute_entites``.

    :param html: The markup to process.
    '''
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    from uuid import uuid4

    processor = HeuristicProcessor(self.extra_opts, self.log)
    open_marker = 'calibre-smartypants-'+str(uuid4())
    close_marker = 'calibre-smartypants-'+str(uuid4())

    # Mask the comment delimiters with unique markers for the duration of
    # the smartening passes, then put them back.
    masked = html.replace('<!--', open_marker).replace('-->', close_marker)
    smartened = smartyPants(processor.fix_nbsp_indents(masked))
    restored = smartened.replace(open_marker, '<!--').replace(close_marker, '-->')

    # convert ellipsis to entities to prevent wrapping
    with_ellipsis = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', restored)
    # convert double dashes to em-dash
    with_dashes = re.sub(r'\s--\s', u'\u2014', with_ellipsis)
    return substitute_entites(with_dashes)

View File

@ -15,6 +15,7 @@ from calibre.ebooks.oeb.polish.container import get_container
from calibre.ebooks.oeb.polish.stats import StatsCollector from calibre.ebooks.oeb.polish.stats import StatsCollector
from calibre.ebooks.oeb.polish.subset import subset_all_fonts from calibre.ebooks.oeb.polish.subset import subset_all_fonts
from calibre.ebooks.oeb.polish.cover import set_cover from calibre.ebooks.oeb.polish.cover import set_cover
from calibre.ebooks.oeb.polish.replace import smarten_punctuation
from calibre.ebooks.oeb.polish.jacket import ( from calibre.ebooks.oeb.polish.jacket import (
replace_jacket, add_or_replace_jacket, find_existing_jacket, remove_jacket) replace_jacket, add_or_replace_jacket, find_existing_jacket, remove_jacket)
from calibre.utils.logging import Log from calibre.utils.logging import Log
@ -25,6 +26,7 @@ ALL_OPTS = {
'cover': None, 'cover': None,
'jacket': False, 'jacket': False,
'remove_jacket':False, 'remove_jacket':False,
'smarten_punctuation':False,
} }
SUPPORTED = {'EPUB', 'AZW3'} SUPPORTED = {'EPUB', 'AZW3'}
@ -72,6 +74,13 @@ etc.</p>'''),
'remove_jacket': _('''\ 'remove_jacket': _('''\
<p>Remove a previous inserted book jacket page.</p> <p>Remove a previous inserted book jacket page.</p>
'''), '''),
'smarten_punctuation': _('''\
<p>Convert plain text, dashes, ellipsis, multiple hyphens, etc. into their
typographically correct equivalents.</p>
<p>Note that the algorithm can sometimes generate incorrect results, especially
when single quotes at the start of contractions are involved.</p>
'''),
} }
def hfix(name, raw): def hfix(name, raw):
@ -121,11 +130,6 @@ def polish(file_map, opts, log, report):
report(_('Updated metadata jacket')) report(_('Updated metadata jacket'))
report(_('Metadata updated\n')) report(_('Metadata updated\n'))
if opts.subset:
rt(_('Subsetting embedded fonts'))
subset_all_fonts(ebook, stats.font_stats, report)
report('')
if opts.cover: if opts.cover:
rt(_('Setting cover')) rt(_('Setting cover'))
set_cover(ebook, opts.cover, report) set_cover(ebook, opts.cover, report)
@ -150,6 +154,16 @@ def polish(file_map, opts, log, report):
report(_('No metadata jacket found')) report(_('No metadata jacket found'))
report('') report('')
if opts.smarten_punctuation:
rt(_('Smartening punctuation'))
smarten_punctuation(ebook, report)
report('')
if opts.subset:
rt(_('Subsetting embedded fonts'))
subset_all_fonts(ebook, stats.font_stats, report)
report('')
ebook.commit(outbook) ebook.commit(outbook)
report('-'*70) report('-'*70)
report(_('Polishing took: %.1f seconds')%(time.time()-st)) report(_('Polishing took: %.1f seconds')%(time.time()-st))
@ -190,6 +204,7 @@ def option_parser():
'Path to an OPF file. The metadata in the book is updated from the OPF file.')) 'Path to an OPF file. The metadata in the book is updated from the OPF file.'))
o('--jacket', '-j', help=CLI_HELP['jacket']) o('--jacket', '-j', help=CLI_HELP['jacket'])
o('--remove-jacket', help=CLI_HELP['remove_jacket']) o('--remove-jacket', help=CLI_HELP['remove_jacket'])
o('--smarten-punctuation', '-p', help=CLI_HELP['smarten_punctuation'])
o('--verbose', help=_('Produce more verbose output, useful for debugging.')) o('--verbose', help=_('Produce more verbose output, useful for debugging.'))

View File

@ -7,10 +7,12 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import codecs
from urlparse import urlparse from urlparse import urlparse
from cssutils import replaceUrls from cssutils import replaceUrls
from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.ebooks.oeb.polish.container import guess_type from calibre.ebooks.oeb.polish.container import guess_type
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links) from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links)
@ -58,4 +60,26 @@ def replace_links(container, link_map, frag_map=lambda name, frag:frag):
if repl.replaced: if repl.replaced:
container.dirty(name) container.dirty(name)
def smarten_punctuation(container, report):
    '''
    Smarten the punctuation of every spine item in ``container``.

    Each item is read and decoded, then passed to the conversion pipeline's
    ``smarten_punctuation``. Only when the text actually changes is the file
    rewritten in place (encoding declarations stripped, UTF-8 with a BOM),
    announced through ``report`` and marked dirty in the container.

    :param container: Object providing ``spine_items``, ``abspath_to_name``,
        ``open``, ``decode``, ``log``, ``parsed`` and ``dirty``.
    :param report: Callable that receives a message for each modified file.
    '''
    from calibre.ebooks.conversion.preprocess import (
        smarten_punctuation as smarten)

    for path in container.spine_items:
        name = container.abspath_to_name(path)
        with container.open(name, 'r+b') as f:
            before = container.decode(f.read())
            after = smarten(before, container.log)
            modified = after != before
            if modified:
                report(_('Smartened punctuation in: %s')%name)
                after = strip_encoding_declarations(after)
                # Overwrite the existing contents from the beginning
                f.seek(0)
                f.truncate()
                f.write(codecs.BOM_UTF8 + after.encode('utf-8'))
        if not modified:
            continue
        # Drop any existing http-equiv meta elements; an encoding declaration
        # will be added automatically when the item is serialized
        root = container.parsed(name)
        stale = root.xpath('descendant::*[local-name()="meta" and @http-equiv]')
        for meta in stale:
            meta.getparent().remove(meta)
        container.dirty(name)

View File

@ -40,6 +40,9 @@ class Polish(QDialog): # {{{
'subset':_('<h3>Subsetting fonts</h3>%s')%HELP['subset'], 'subset':_('<h3>Subsetting fonts</h3>%s')%HELP['subset'],
'smarten_punctuation':
_('<h3>Smarten punctuation</h3>%s')%HELP['smarten_punctuation'],
'metadata':_('<h3>Updating metadata</h3>' 'metadata':_('<h3>Updating metadata</h3>'
'<p>This will update all metadata and covers in the' '<p>This will update all metadata and covers in the'
' ebook files to match the current metadata in the' ' ebook files to match the current metadata in the'
@ -61,6 +64,7 @@ class Polish(QDialog): # {{{
count = 0 count = 0
self.all_actions = OrderedDict([ self.all_actions = OrderedDict([
('subset', _('Subset all embedded fonts')), ('subset', _('Subset all embedded fonts')),
('smarten_punctuation', _('Smarten punctuation')),
('metadata', _('Update metadata in book files')), ('metadata', _('Update metadata in book files')),
('jacket', _('Add metadata as a "book jacket" page')), ('jacket', _('Add metadata as a "book jacket" page')),
('remove_jacket', _('Remove a previously inserted book jacket')), ('remove_jacket', _('Remove a previously inserted book jacket')),