mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Book polishing: Add an option to smarten punctuation in the book when polishing
This commit is contained in:
parent
bb27c4ebda
commit
25fd859df6
@ -62,6 +62,26 @@ def wrap_lines(match):
|
|||||||
else:
|
else:
|
||||||
return ital+' '
|
return ital+' '
|
||||||
|
|
||||||
|
def smarten_punctuation(html, log):
    '''
    Replace plain ASCII punctuation in ``html`` with typographic equivalents.

    The markup is run through smartyPants, after which three-dot sequences
    that follow a word character are collapsed into an ellipsis and
    whitespace-delimited double hyphens are turned into an em-dash.

    :param html: The markup to process.
    :param log: Logger handed to the HeuristicProcessor used for the
                non-breaking-space indent fix.
    :return: The processed markup, after being passed through
             substitute_entites().
    '''
    from uuid import uuid4
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    preprocessor = HeuristicProcessor(log=log)

    # Swap the comment delimiters for unique placeholders while smartyPants
    # runs and restore them afterwards (presumably so that the hyphens in
    # '<!--' and '-->' are not mistaken for dashes).
    start = 'calibre-smartypants-'+str(uuid4())
    stop = 'calibre-smartypants-'+str(uuid4())
    html = html.replace('<!--', start)
    html = html.replace('-->', stop)
    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)
    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')

    # convert ellipsis to entities to prevent wrapping
    # The replacement is the pure-ASCII entity rather than a literal
    # ellipsis character: a non-ASCII byte string cannot be combined with
    # unicode input in Python 2. NOTE(review): substitute_entites() below is
    # expected to resolve the entity -- confirm.
    html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
    # convert double dashes to em-dash (the whitespace on either side of the
    # hyphens is consumed by the match)
    html = re.sub(r'\s--\s', u'\u2014', html)
    return substitute_entites(html)
|
||||||
|
|
||||||
class DocAnalysis(object):
|
class DocAnalysis(object):
|
||||||
'''
|
'''
|
||||||
Provides various text analysis functions to determine how the document is structured.
|
Provides various text analysis functions to determine how the document is structured.
|
||||||
@ -638,7 +658,7 @@ class HTMLPreProcessor(object):
|
|||||||
html = preprocessor(html)
|
html = preprocessor(html)
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
||||||
html = self.smarten_punctuation(html)
|
html = smarten_punctuation(html, self.log)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
|
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
|
||||||
@ -653,23 +673,4 @@ class HTMLPreProcessor(object):
|
|||||||
|
|
||||||
return html
|
return html
|
||||||
|
|
||||||
def smarten_punctuation(self, html):
|
|
||||||
from calibre.utils.smartypants import smartyPants
|
|
||||||
from calibre.ebooks.chardet import substitute_entites
|
|
||||||
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
|
||||||
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
|
|
||||||
from uuid import uuid4
|
|
||||||
start = 'calibre-smartypants-'+str(uuid4())
|
|
||||||
stop = 'calibre-smartypants-'+str(uuid4())
|
|
||||||
html = html.replace('<!--', start)
|
|
||||||
html = html.replace('-->', stop)
|
|
||||||
html = preprocessor.fix_nbsp_indents(html)
|
|
||||||
html = smartyPants(html)
|
|
||||||
html = html.replace(start, '<!--')
|
|
||||||
html = html.replace(stop, '-->')
|
|
||||||
# convert ellipsis to entities to prevent wrapping
|
|
||||||
html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html)
|
|
||||||
# convert double dashes to em-dash
|
|
||||||
html = re.sub(r'\s--\s', u'\u2014', html)
|
|
||||||
return substitute_entites(html)
|
|
||||||
|
|
||||||
|
@ -15,6 +15,7 @@ from calibre.ebooks.oeb.polish.container import get_container
|
|||||||
from calibre.ebooks.oeb.polish.stats import StatsCollector
|
from calibre.ebooks.oeb.polish.stats import StatsCollector
|
||||||
from calibre.ebooks.oeb.polish.subset import subset_all_fonts
|
from calibre.ebooks.oeb.polish.subset import subset_all_fonts
|
||||||
from calibre.ebooks.oeb.polish.cover import set_cover
|
from calibre.ebooks.oeb.polish.cover import set_cover
|
||||||
|
from calibre.ebooks.oeb.polish.replace import smarten_punctuation
|
||||||
from calibre.ebooks.oeb.polish.jacket import (
|
from calibre.ebooks.oeb.polish.jacket import (
|
||||||
replace_jacket, add_or_replace_jacket, find_existing_jacket, remove_jacket)
|
replace_jacket, add_or_replace_jacket, find_existing_jacket, remove_jacket)
|
||||||
from calibre.utils.logging import Log
|
from calibre.utils.logging import Log
|
||||||
@ -25,6 +26,7 @@ ALL_OPTS = {
|
|||||||
'cover': None,
|
'cover': None,
|
||||||
'jacket': False,
|
'jacket': False,
|
||||||
'remove_jacket':False,
|
'remove_jacket':False,
|
||||||
|
'smarten_punctuation':False,
|
||||||
}
|
}
|
||||||
|
|
||||||
SUPPORTED = {'EPUB', 'AZW3'}
|
SUPPORTED = {'EPUB', 'AZW3'}
|
||||||
@ -72,6 +74,13 @@ etc.</p>'''),
|
|||||||
'remove_jacket': _('''\
|
'remove_jacket': _('''\
|
||||||
<p>Remove a previous inserted book jacket page.</p>
|
<p>Remove a previous inserted book jacket page.</p>
|
||||||
'''),
|
'''),
|
||||||
|
|
||||||
|
'smarten_punctuation': _('''\
|
||||||
|
<p>Convert plain text, dashes, ellipsis, multiple hyphens, etc. into their
|
||||||
|
typographically correct equivalents.</p>
|
||||||
|
<p>Note that the algorithm can sometimes generate incorrect results, especially
|
||||||
|
when single quotes at the start of contractions are involved.</p>
|
||||||
|
'''),
|
||||||
}
|
}
|
||||||
|
|
||||||
def hfix(name, raw):
|
def hfix(name, raw):
|
||||||
@ -121,11 +130,6 @@ def polish(file_map, opts, log, report):
|
|||||||
report(_('Updated metadata jacket'))
|
report(_('Updated metadata jacket'))
|
||||||
report(_('Metadata updated\n'))
|
report(_('Metadata updated\n'))
|
||||||
|
|
||||||
if opts.subset:
|
|
||||||
rt(_('Subsetting embedded fonts'))
|
|
||||||
subset_all_fonts(ebook, stats.font_stats, report)
|
|
||||||
report('')
|
|
||||||
|
|
||||||
if opts.cover:
|
if opts.cover:
|
||||||
rt(_('Setting cover'))
|
rt(_('Setting cover'))
|
||||||
set_cover(ebook, opts.cover, report)
|
set_cover(ebook, opts.cover, report)
|
||||||
@ -150,6 +154,16 @@ def polish(file_map, opts, log, report):
|
|||||||
report(_('No metadata jacket found'))
|
report(_('No metadata jacket found'))
|
||||||
report('')
|
report('')
|
||||||
|
|
||||||
|
if opts.smarten_punctuation:
|
||||||
|
rt(_('Smartening punctuation'))
|
||||||
|
smarten_punctuation(ebook, report)
|
||||||
|
report('')
|
||||||
|
|
||||||
|
if opts.subset:
|
||||||
|
rt(_('Subsetting embedded fonts'))
|
||||||
|
subset_all_fonts(ebook, stats.font_stats, report)
|
||||||
|
report('')
|
||||||
|
|
||||||
ebook.commit(outbook)
|
ebook.commit(outbook)
|
||||||
report('-'*70)
|
report('-'*70)
|
||||||
report(_('Polishing took: %.1f seconds')%(time.time()-st))
|
report(_('Polishing took: %.1f seconds')%(time.time()-st))
|
||||||
@ -190,6 +204,7 @@ def option_parser():
|
|||||||
'Path to an OPF file. The metadata in the book is updated from the OPF file.'))
|
'Path to an OPF file. The metadata in the book is updated from the OPF file.'))
|
||||||
o('--jacket', '-j', help=CLI_HELP['jacket'])
|
o('--jacket', '-j', help=CLI_HELP['jacket'])
|
||||||
o('--remove-jacket', help=CLI_HELP['remove_jacket'])
|
o('--remove-jacket', help=CLI_HELP['remove_jacket'])
|
||||||
|
o('--smarten-punctuation', '-p', help=CLI_HELP['smarten_punctuation'])
|
||||||
|
|
||||||
o('--verbose', help=_('Produce more verbose output, useful for debugging.'))
|
o('--verbose', help=_('Produce more verbose output, useful for debugging.'))
|
||||||
|
|
||||||
|
@ -7,10 +7,12 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import codecs
|
||||||
from urlparse import urlparse
|
from urlparse import urlparse
|
||||||
|
|
||||||
from cssutils import replaceUrls
|
from cssutils import replaceUrls
|
||||||
|
|
||||||
|
from calibre.ebooks.chardet import strip_encoding_declarations
|
||||||
from calibre.ebooks.oeb.polish.container import guess_type
|
from calibre.ebooks.oeb.polish.container import guess_type
|
||||||
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links)
|
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links)
|
||||||
|
|
||||||
@ -58,4 +60,26 @@ def replace_links(container, link_map, frag_map=lambda name, frag:frag):
|
|||||||
if repl.replaced:
|
if repl.replaced:
|
||||||
container.dirty(name)
|
container.dirty(name)
|
||||||
|
|
||||||
|
def smarten_punctuation(container, report):
    '''
    Smarten the punctuation of every spine item in ``container``.

    Each spine file is decoded, run through the conversion pipeline's
    smarten_punctuation() and, only if the text changed, rewritten in place
    as UTF-8 with a BOM. Changed files are then re-parsed, stripped of their
    ``<meta http-equiv>`` elements and marked dirty.

    :param container: The book container being polished.
    :param report: Callable that receives one message per modified file.
    '''
    # Aliased so the imported function does not shadow this one locally.
    from calibre.ebooks.conversion.preprocess import (
        smarten_punctuation as smarten)

    for path in container.spine_items:
        name = container.abspath_to_name(path)
        with container.open(name, 'r+b') as f:
            html = container.decode(f.read())
            newhtml = smarten(html, container.log)
            changed = newhtml != html
            if changed:
                report(_('Smartened punctuation in: %s')%name)
                newhtml = strip_encoding_declarations(newhtml)
                # Overwrite the file from the start with the new text
                f.seek(0)
                f.truncate()
                f.write(codecs.BOM_UTF8 + newhtml.encode('utf-8'))

        if not changed:
            continue

        # Add an encoding declaration (it will be added automatically when
        # serialized); the existing http-equiv meta tags are dropped first.
        root = container.parsed(name)
        for m in root.xpath('descendant::*[local-name()="meta" and @http-equiv]'):
            m.getparent().remove(m)
        container.dirty(name)
|
||||||
|
|
||||||
|
@ -40,6 +40,9 @@ class Polish(QDialog): # {{{
|
|||||||
|
|
||||||
'subset':_('<h3>Subsetting fonts</h3>%s')%HELP['subset'],
|
'subset':_('<h3>Subsetting fonts</h3>%s')%HELP['subset'],
|
||||||
|
|
||||||
|
'smarten_punctuation':
|
||||||
|
_('<h3>Smarten punctuation</h3>%s')%HELP['smarten_punctuation'],
|
||||||
|
|
||||||
'metadata':_('<h3>Updating metadata</h3>'
|
'metadata':_('<h3>Updating metadata</h3>'
|
||||||
'<p>This will update all metadata and covers in the'
|
'<p>This will update all metadata and covers in the'
|
||||||
' ebook files to match the current metadata in the'
|
' ebook files to match the current metadata in the'
|
||||||
@ -61,6 +64,7 @@ class Polish(QDialog): # {{{
|
|||||||
count = 0
|
count = 0
|
||||||
self.all_actions = OrderedDict([
|
self.all_actions = OrderedDict([
|
||||||
('subset', _('Subset all embedded fonts')),
|
('subset', _('Subset all embedded fonts')),
|
||||||
|
('smarten_punctuation', _('Smarten punctuation')),
|
||||||
('metadata', _('Update metadata in book files')),
|
('metadata', _('Update metadata in book files')),
|
||||||
('jacket', _('Add metadata as a "book jacket" page')),
|
('jacket', _('Add metadata as a "book jacket" page')),
|
||||||
('remove_jacket', _('Remove a previously inserted book jacket')),
|
('remove_jacket', _('Remove a previously inserted book jacket')),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user