Make unsmarten punctuation a global option.

This commit is contained in:
John Schember 2011-09-05 12:24:26 -04:00
parent 123991aea5
commit ec448064aa
11 changed files with 112 additions and 52 deletions

View File

@ -134,7 +134,8 @@ def add_pipeline_options(parser, plumber):
'font_size_mapping',
'line_height', 'minimum_line_height',
'linearize_tables',
'extra_css', 'smarten_punctuation',
'extra_css',
'smarten_punctuation', 'unsmarten_punctuation',
'margin_top', 'margin_left', 'margin_right',
'margin_bottom', 'change_justification',
'insert_blank_line', 'insert_blank_line_size',

View File

@ -415,6 +415,13 @@ OptionRecommendation(name='smarten_punctuation',
)
),
OptionRecommendation(name='unsmarten_punctuation',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Convert fancy quotes, dashes and ellipsis to their '
'plain equivalents.'
)
),
OptionRecommendation(name='read_metadata_from_opf',
recommended_value=None, level=OptionRecommendation.LOW,
short_switch='m',

View File

@ -605,6 +605,9 @@ class HTMLPreProcessor(object):
if getattr(self.extra_opts, 'smarten_punctuation', False):
html = self.smarten_punctuation(html)
if getattr(self.extra_opts, 'unsmarten_punctuation', False):
html = self.unsmarten_punctuation(html)
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
if unsupported_unicode_chars:
@ -636,3 +639,12 @@ class HTMLPreProcessor(object):
html = re.sub(r'\s--\s', u'\u2014', html)
return substitute_entites(html)
def unsmarten_punctuation(self, html):
    """Replace typographic punctuation in *html* with plain equivalents.

    The markup first goes through the heuristic processor's
    ``fix_nbsp_indents`` pass, is then handed to ``unsmarten_html``, and the
    result is finally run through ``substitute_entites``.

    :param html: the markup to process.
    :return: whatever ``substitute_entites`` returns for the processed markup.
    """
    # Imported at call time, in the same order as before.
    from calibre.utils.unsmarten import unsmarten_html
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor

    heuristics = HeuristicProcessor(self.extra_opts, self.log)
    plain_markup = unsmarten_html(heuristics.fix_nbsp_indents(html))
    return substitute_entites(plain_markup)

View File

@ -7,9 +7,6 @@ __docformat__ = 'restructuredtext en'
import re
def unsmarten(txt):
from calibre.ebooks.txt.unsmarten import unsmarten as txt_unsmarten
txt = txt_unsmarten(txt)
txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent
txt = re.sub(u'£|£|£', r'{L-}', txt) # pound
txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen

View File

@ -15,7 +15,6 @@ from functools import partial
from calibre.ebooks.htmlz.oeb2html import OEB2HTML
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.txt.unsmarten import unsmarten
class MarkdownMLizer(OEB2HTML):
@ -34,8 +33,6 @@ class MarkdownMLizer(OEB2HTML):
self.style_italic = False
txt = self.mlize_spine(oeb_book)
if self.opts.unsmarten_punctuation:
txt = unsmarten(txt)
# Do some tidying up
txt = self.tidy_up(txt)

View File

@ -56,10 +56,6 @@ class TXTOutput(OutputFormatPlugin):
'* plain: Produce plain text.\n'
'* markdown: Produce Markdown formatted text.\n'
'* textile: Produce Textile formatted text.')),
OptionRecommendation(name='unsmarten_punctuation',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Convert fancy quotes, dashes and ellipsis to their '
'plain equivalents.')),
OptionRecommendation(name='keep_links',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Do not remove links within the document. This is only ' \

View File

@ -12,8 +12,6 @@ import re
from lxml import etree
from calibre.ebooks.txt.unsmarten import unsmarten
BLOCK_TAGS = [
'div',
@ -78,8 +76,6 @@ class TXTMLizer(object):
output += '\n\n\n\n\n\n'
output = u''.join(output)
output = u'\n'.join(l.rstrip() for l in output.splitlines())
if self.opts.unsmarten_punctuation:
output = unsmarten(output)
output = self.cleanup_text(output)
return output

View File

@ -1,18 +0,0 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
__docformat__ = 'restructuredtext en'
import re
def unsmarten(txt):
    """Return *txt* with typographic punctuation replaced by plain characters.

    Handles the numeric entity, the named entity and the literal character
    for en/em dashes, the ellipsis, and double/single quotes and primes. A
    right single quote that follows a quote character or whitespace is
    rewritten to ``{'/}`` before the remaining single quotes are flattened.

    :param txt: text to convert.
    :return: the converted text.
    """
    # The characters that had gone missing from these patterns are written as
    # \u escapes so they cannot be silently dropped again. Without them the
    # patterns ended in an empty alternative, which matches at every
    # position and inserts the replacement between all characters.
    txt = re.sub(u'&#8211;|&ndash;|\u2013', r'-', txt)  # en-dash
    txt = re.sub(u'&#8212;|&mdash;|—', r'--', txt)  # em-dash
    txt = re.sub(u'&#8230;|&hellip;|…', r'...', txt)  # ellipsis
    txt = re.sub(u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|“|”|″', r'"', txt)  # double quote
    # NOTE(review): the trailing right single quote (U+2019) is restored here
    # on the assumption that it was lost like the others; without it every
    # quote and whitespace character gets "{'/}" appended.
    txt = re.sub(u'(["\'‘“]|\\s)\u2019', r"\1{'/}", txt)  # apostrophe
    txt = re.sub(u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|\u2018|\u2019|\u2032', r"'", txt)  # single quote
    return txt

View File

@ -22,13 +22,14 @@ class LookAndFeelWidget(Widget, Ui_Form):
Widget.__init__(self, parent,
['change_justification', 'extra_css', 'base_font_size',
'font_size_mapping', 'line_height', 'minimum_line_height',
'linearize_tables', 'smarten_punctuation',
'smarten_punctuation', 'unsmarten_punctuation',
'disable_font_rescaling', 'insert_blank_line',
'remove_paragraph_spacing',
'remove_paragraph_spacing_indent_size',
'insert_blank_line_size',
'input_encoding',
'asciiize', 'keep_ligatures']
'asciiize', 'keep_ligatures',
'linearize_tables']
)
for val, text in [
('original', _('Original')),

View File

@ -7,7 +7,7 @@
<x>0</x>
<y>0</y>
<width>642</width>
<height>500</height>
<height>522</height>
</rect>
</property>
<property name="windowTitle">
@ -84,7 +84,7 @@
<string>...</string>
</property>
<property name="icon">
<iconset resource="../../../../resources/images.qrc">
<iconset>
<normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset>
</property>
<property name="iconSize">
@ -194,13 +194,6 @@
<item row="8" column="2" colspan="3">
<widget class="QComboBox" name="opt_change_justification"/>
</item>
<item row="9" column="0">
<widget class="QCheckBox" name="opt_linearize_tables">
<property name="text">
<string>&amp;Linearize tables</string>
</property>
</widget>
</item>
<item row="9" column="1" colspan="4">
<widget class="QCheckBox" name="opt_asciiize">
<property name="text">
@ -215,7 +208,7 @@
</property>
</widget>
</item>
<item row="12" column="0" colspan="5">
<item row="13" column="0" colspan="5">
<widget class="QGroupBox" name="groupBox">
<property name="title">
<string>Extra &amp;CSS</string>
@ -240,13 +233,6 @@
</property>
</widget>
</item>
<item row="10" column="0">
<widget class="QCheckBox" name="opt_smarten_punctuation">
<property name="text">
<string>Smarten &amp;punctuation</string>
</property>
</widget>
</item>
<item row="6" column="3">
<widget class="QLabel" name="label_4">
<property name="text">
@ -273,6 +259,27 @@
</property>
</widget>
</item>
<item row="9" column="0">
<widget class="QCheckBox" name="opt_smarten_punctuation">
<property name="text">
<string>Smarten &amp;punctuation</string>
</property>
</widget>
</item>
<item row="10" column="0">
<widget class="QCheckBox" name="opt_unsmarten_punctuation">
<property name="text">
<string>&amp;UnSmarten punctuation</string>
</property>
</widget>
</item>
<item row="10" column="3">
<widget class="QCheckBox" name="opt_linearize_tables">
<property name="text">
<string>&amp;Linearize tables</string>
</property>
</widget>
</item>
</layout>
</widget>
<customwidgets>

View File

@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re
from lxml import html as lhtml
from calibre import prepare_string_for_xml
from calibre.ebooks.oeb.base import barename
def unsmarten_html(html):
    """Return *html* re-serialized with typographic punctuation made plain.

    The markup is parsed with ``lxml.html.fromstring`` and written back out
    element by element. Text and tail strings are passed through
    ``unsmarten_text`` and re-escaped with ``prepare_string_for_xml``. Text
    directly inside a ``pre`` element is re-escaped but not unsmartened.

    :param html: markup to process.
    :return: the rebuilt markup as a single string.
    """

    def dump_text(elem):
        """Serialize *elem*, its descendants and its tail as a list of strings."""
        # NOTE(review): comments and processing instructions carry a
        # non-string ``tag`` in lxml; confirm that ``barename`` copes with it.
        tag = barename(elem.tag)

        # Turn the attributes into a string we can write with the tag.
        at = ''.join(
            ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
            for k, v in elem.attrib.items())

        # Write the tag.
        text = ['<%s%s>' % (tag, at)]

        # Process tags that contain text.
        if elem.text:
            if tag == 'pre':
                # Don't unsmarten text in pre tags, but do escape it: the
                # parser hands back decoded text, so writing it out raw
                # would turn a literal "<" or "&" into broken markup.
                text.append(prepare_string_for_xml(elem.text))
            else:
                text.append(prepare_string_for_xml(unsmarten_text(elem.text)))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text.extend(dump_text(item))

        # Close the tag.
        text.append('</%s>' % tag)

        # Add the text that is outside of the tag.
        if elem.tail:
            text.append(prepare_string_for_xml(unsmarten_text(elem.tail)))
        return text

    return ''.join(dump_text(lhtml.fromstring(html)))
def unsmarten_text(txt):
    """Return *txt* with typographic punctuation replaced by plain characters.

    For each of the en-dash (``--``), em-dash (``---``), ellipsis (``...``),
    double quotes/double prime (``"``) and single quotes/prime (``'``), the
    numeric entity, the named entity and the literal character are all
    replaced.

    :param txt: text to convert.
    :return: the converted text.
    """
    # The en-dash and the single quote/prime characters are written as \u
    # escapes. They had gone missing from the patterns, leaving an empty
    # alternative that matches at every position and made re.sub insert the
    # replacement between all characters of the input.
    txt = re.sub(u'&#8211;|&ndash;|\u2013', r'--', txt)  # en-dash
    txt = re.sub(u'&#8212;|&mdash;|—', r'---', txt)  # em-dash
    txt = re.sub(u'&#8230;|&hellip;|…', r'...', txt)  # ellipsis
    txt = re.sub(u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|“|”|″', r'"', txt)  # double quote
    txt = re.sub(u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|\u2018|\u2019|\u2032', r"'", txt)  # single quote
    return txt