mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Book polishing: Add an option to smarten punctuation in the book when polishing
This commit is contained in:
parent
bb27c4ebda
commit
25fd859df6
@ -62,6 +62,26 @@ def wrap_lines(match):
|
||||
else:
|
||||
return ital+' '
|
||||
|
||||
def smarten_punctuation(html, log):
    '''
    Convert plain punctuation in ``html`` (straight quotes, runs of dots,
    double hyphens) into typographic equivalents.

    HTML comment delimiters are replaced by unique placeholder tokens before
    the markup is handed to fix_nbsp_indents() and smartyPants(), and are
    restored afterwards (presumably so that those passes leave ``<!--`` and
    ``-->`` alone -- TODO confirm).

    :param html: The markup to process, as text.
    :param log: Log object passed to HeuristicProcessor.
    :return: The processed markup after substitute_entites() has been applied.
    '''
    from uuid import uuid4

    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor

    preprocessor = HeuristicProcessor(log=log)

    # Two distinct random tokens, so the opening and closing comment
    # delimiters can be told apart when they are restored.
    start = 'calibre-smartypants-'+str(uuid4())
    stop = 'calibre-smartypants-'+str(uuid4())
    html = html.replace('<!--', start)
    html = html.replace('-->', stop)

    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)

    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')

    # convert ellipsis to entities to prevent wrapping
    # The replacement is the pure-ASCII entity rather than a raw non-ASCII
    # character in a byte-string literal, which cannot be mixed safely with
    # unicode markup. NOTE(review): substitute_entites() below is expected to
    # resolve the entity -- confirm.
    html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
    # convert double dashes to em-dash
    html = re.sub(r'\s--\s', u'\u2014', html)
    return substitute_entites(html)
|
||||
|
||||
class DocAnalysis(object):
|
||||
'''
|
||||
Provides various text analysis functions to determine how the document is structured.
|
||||
@ -638,7 +658,7 @@ class HTMLPreProcessor(object):
|
||||
html = preprocessor(html)
|
||||
|
||||
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
||||
html = self.smarten_punctuation(html)
|
||||
html = smarten_punctuation(html, self.log)
|
||||
|
||||
try:
|
||||
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
|
||||
@ -653,23 +673,4 @@ class HTMLPreProcessor(object):
|
||||
|
||||
return html
|
||||
|
||||
def smarten_punctuation(self, html):
    '''
    Convert plain punctuation in ``html`` (straight quotes, runs of dots,
    double hyphens) into typographic equivalents.

    HTML comment delimiters are replaced by unique placeholder tokens before
    the markup is handed to fix_nbsp_indents() and smartyPants(), and are
    restored afterwards (presumably so that those passes leave ``<!--`` and
    ``-->`` alone -- TODO confirm).

    Reads ``self.extra_opts`` and ``self.log`` to build the HeuristicProcessor.

    :param html: The markup to process, as text.
    :return: The processed markup after substitute_entites() has been applied.
    '''
    from uuid import uuid4

    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor

    preprocessor = HeuristicProcessor(self.extra_opts, self.log)

    # Two distinct random tokens, so the opening and closing comment
    # delimiters can be told apart when they are restored.
    start = 'calibre-smartypants-'+str(uuid4())
    stop = 'calibre-smartypants-'+str(uuid4())
    html = html.replace('<!--', start)
    html = html.replace('-->', stop)

    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)

    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')

    # convert ellipsis to entities to prevent wrapping
    # The replacement is the pure-ASCII entity rather than a raw non-ASCII
    # character in a byte-string literal, which cannot be mixed safely with
    # unicode markup. NOTE(review): substitute_entites() below is expected to
    # resolve the entity -- confirm.
    html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
    # convert double dashes to em-dash
    html = re.sub(r'\s--\s', u'\u2014', html)
    return substitute_entites(html)
|
||||
|
||||
|
@ -15,6 +15,7 @@ from calibre.ebooks.oeb.polish.container import get_container
|
||||
from calibre.ebooks.oeb.polish.stats import StatsCollector
|
||||
from calibre.ebooks.oeb.polish.subset import subset_all_fonts
|
||||
from calibre.ebooks.oeb.polish.cover import set_cover
|
||||
from calibre.ebooks.oeb.polish.replace import smarten_punctuation
|
||||
from calibre.ebooks.oeb.polish.jacket import (
|
||||
replace_jacket, add_or_replace_jacket, find_existing_jacket, remove_jacket)
|
||||
from calibre.utils.logging import Log
|
||||
@ -25,6 +26,7 @@ ALL_OPTS = {
|
||||
'cover': None,
|
||||
'jacket': False,
|
||||
'remove_jacket':False,
|
||||
'smarten_punctuation':False,
|
||||
}
|
||||
|
||||
SUPPORTED = {'EPUB', 'AZW3'}
|
||||
@ -72,6 +74,13 @@ etc.</p>'''),
|
||||
'remove_jacket': _('''\
|
||||
<p>Remove a previous inserted book jacket page.</p>
|
||||
'''),
|
||||
|
||||
'smarten_punctuation': _('''\
|
||||
<p>Convert plain text, dashes, ellipsis, multiple hyphens, etc. into their
|
||||
typographically correct equivalents.</p>
|
||||
<p>Note that the algorithm can sometimes generate incorrect results, especially
|
||||
when single quotes at the start of contractions are involved.</p>
|
||||
'''),
|
||||
}
|
||||
|
||||
def hfix(name, raw):
|
||||
@ -121,11 +130,6 @@ def polish(file_map, opts, log, report):
|
||||
report(_('Updated metadata jacket'))
|
||||
report(_('Metadata updated\n'))
|
||||
|
||||
if opts.subset:
|
||||
rt(_('Subsetting embedded fonts'))
|
||||
subset_all_fonts(ebook, stats.font_stats, report)
|
||||
report('')
|
||||
|
||||
if opts.cover:
|
||||
rt(_('Setting cover'))
|
||||
set_cover(ebook, opts.cover, report)
|
||||
@ -150,6 +154,16 @@ def polish(file_map, opts, log, report):
|
||||
report(_('No metadata jacket found'))
|
||||
report('')
|
||||
|
||||
if opts.smarten_punctuation:
|
||||
rt(_('Smartening punctuation'))
|
||||
smarten_punctuation(ebook, report)
|
||||
report('')
|
||||
|
||||
if opts.subset:
|
||||
rt(_('Subsetting embedded fonts'))
|
||||
subset_all_fonts(ebook, stats.font_stats, report)
|
||||
report('')
|
||||
|
||||
ebook.commit(outbook)
|
||||
report('-'*70)
|
||||
report(_('Polishing took: %.1f seconds')%(time.time()-st))
|
||||
@ -190,6 +204,7 @@ def option_parser():
|
||||
'Path to an OPF file. The metadata in the book is updated from the OPF file.'))
|
||||
o('--jacket', '-j', help=CLI_HELP['jacket'])
|
||||
o('--remove-jacket', help=CLI_HELP['remove_jacket'])
|
||||
o('--smarten-punctuation', '-p', help=CLI_HELP['smarten_punctuation'])
|
||||
|
||||
o('--verbose', help=_('Produce more verbose output, useful for debugging.'))
|
||||
|
||||
|
@ -7,10 +7,12 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import codecs
|
||||
from urlparse import urlparse
|
||||
|
||||
from cssutils import replaceUrls
|
||||
|
||||
from calibre.ebooks.chardet import strip_encoding_declarations
|
||||
from calibre.ebooks.oeb.polish.container import guess_type
|
||||
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links)
|
||||
|
||||
@ -58,4 +60,26 @@ def replace_links(container, link_map, frag_map=lambda name, frag:frag):
|
||||
if repl.replaced:
|
||||
container.dirty(name)
|
||||
|
||||
def smarten_punctuation(container, report):
    '''
    Smarten the punctuation of every spine item in ``container``.

    Each spine file is decoded, passed through the conversion pipeline's
    smarten_punctuation(), and rewritten in place as BOM-prefixed UTF-8 only
    when the text actually changed. Changed files then have their
    ``<meta http-equiv>`` elements removed from the parsed tree and are
    marked dirty.

    :param container: The book container; its spine_items, abspath_to_name(),
                      open(), decode(), log, parsed() and dirty() are used.
    :param report: Callable invoked with a message for every changed file.
    '''
    # Aliased so the imported helper does not shadow this function's own name.
    from calibre.ebooks.conversion.preprocess import (
        smarten_punctuation as smarten_markup)

    for spine_path in container.spine_items:
        name = container.abspath_to_name(spine_path)
        with container.open(name, 'r+b') as stream:
            before = container.decode(stream.read())
            after = smarten_markup(before, container.log)
            modified = after != before
            if modified:
                report(_('Smartened punctuation in: %s')%name)
                after = strip_encoding_declarations(after)
                # Rewrite the file from the beginning, discarding old bytes.
                stream.seek(0)
                stream.truncate()
                stream.write(codecs.BOM_UTF8 + after.encode('utf-8'))

        if not modified:
            continue

        # Add an encoding declaration (it will be added automatically when
        # serialized)
        root = container.parsed(name)
        stale_metas = root.xpath(
            'descendant::*[local-name()="meta" and @http-equiv]')
        for meta in stale_metas:
            meta.getparent().remove(meta)
        container.dirty(name)
|
||||
|
||||
|
@ -40,6 +40,9 @@ class Polish(QDialog): # {{{
|
||||
|
||||
'subset':_('<h3>Subsetting fonts</h3>%s')%HELP['subset'],
|
||||
|
||||
'smarten_punctuation':
|
||||
_('<h3>Smarten punctuation</h3>%s')%HELP['smarten_punctuation'],
|
||||
|
||||
'metadata':_('<h3>Updating metadata</h3>'
|
||||
'<p>This will update all metadata and covers in the'
|
||||
' ebook files to match the current metadata in the'
|
||||
@ -61,6 +64,7 @@ class Polish(QDialog): # {{{
|
||||
count = 0
|
||||
self.all_actions = OrderedDict([
|
||||
('subset', _('Subset all embedded fonts')),
|
||||
('smarten_punctuation', _('Smarten punctuation')),
|
||||
('metadata', _('Update metadata in book files')),
|
||||
('jacket', _('Add metadata as a "book jacket" page')),
|
||||
('remove_jacket', _('Remove a previously inserted book jacket')),
|
||||
|
Loading…
x
Reference in New Issue
Block a user