From ec448064aa37ba11fa0a3e8318311b532cc26e1e Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 5 Sep 2011 12:24:26 -0400 Subject: [PATCH] Make unsmarten punctuation a global option. --- src/calibre/ebooks/conversion/cli.py | 3 +- src/calibre/ebooks/conversion/plumber.py | 7 +++ src/calibre/ebooks/conversion/preprocess.py | 12 ++++ src/calibre/ebooks/textile/unsmarten.py | 3 - src/calibre/ebooks/txt/markdownml.py | 3 - src/calibre/ebooks/txt/output.py | 4 -- src/calibre/ebooks/txt/txtml.py | 4 -- src/calibre/ebooks/txt/unsmarten.py | 18 ------ src/calibre/gui2/convert/look_and_feel.py | 5 +- src/calibre/gui2/convert/look_and_feel.ui | 41 +++++++------ src/calibre/utils/unsmarten.py | 64 +++++++++++++++++++++ 11 files changed, 112 insertions(+), 52 deletions(-) delete mode 100644 src/calibre/ebooks/txt/unsmarten.py create mode 100644 src/calibre/utils/unsmarten.py diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index c605df0de4..ed332acac2 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -134,7 +134,8 @@ def add_pipeline_options(parser, plumber): 'font_size_mapping', 'line_height', 'minimum_line_height', 'linearize_tables', - 'extra_css', 'smarten_punctuation', + 'extra_css', + 'smarten_punctuation', 'unsmarten_punctuation', 'margin_top', 'margin_left', 'margin_right', 'margin_bottom', 'change_justification', 'insert_blank_line', 'insert_blank_line_size', diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index adff954e62..fefc08b19d 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -415,6 +415,13 @@ OptionRecommendation(name='smarten_punctuation', ) ), +OptionRecommendation(name='unsmarten_punctuation', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Convert fancy quotes, dashes and ellipsis to their ' + 'plain equivalents.' 
+ ) + ), + OptionRecommendation(name='read_metadata_from_opf', recommended_value=None, level=OptionRecommendation.LOW, short_switch='m', diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 0f804cc208..d1ccd8a082 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -605,6 +605,9 @@ class HTMLPreProcessor(object): if getattr(self.extra_opts, 'smarten_punctuation', False): html = self.smarten_punctuation(html) + + if getattr(self.extra_opts, 'unsmarten_punctuation', False): + html = self.unsmarten_punctuation(html) unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars if unsupported_unicode_chars: @@ -636,3 +639,12 @@ class HTMLPreProcessor(object): html = re.sub(r'\s--\s', u'\u2014', html) return substitute_entites(html) + def unsmarten_punctuation(self, html): + from calibre.utils.unsmarten import unsmarten_html + from calibre.ebooks.chardet import substitute_entites + from calibre.ebooks.conversion.utils import HeuristicProcessor + preprocessor = HeuristicProcessor(self.extra_opts, self.log) + html = preprocessor.fix_nbsp_indents(html) + html = unsmarten_html(html) + return substitute_entites(html) + diff --git a/src/calibre/ebooks/textile/unsmarten.py b/src/calibre/ebooks/textile/unsmarten.py index c31bb77c24..94127c5c39 100644 --- a/src/calibre/ebooks/textile/unsmarten.py +++ b/src/calibre/ebooks/textile/unsmarten.py @@ -7,9 +7,6 @@ __docformat__ = 'restructuredtext en' import re def unsmarten(txt): - from calibre.ebooks.txt.unsmarten import unsmarten as txt_unsmarten - txt = txt_unsmarten(txt) - txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent txt = re.sub(u'£|£|£', r'{L-}', txt) # pound txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py index 878633add3..79cfabf65e 100644 --- a/src/calibre/ebooks/txt/markdownml.py +++ 
b/src/calibre/ebooks/txt/markdownml.py @@ -15,7 +15,6 @@ from functools import partial from calibre.ebooks.htmlz.oeb2html import OEB2HTML from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links from calibre.ebooks.oeb.stylizer import Stylizer -from calibre.ebooks.txt.unsmarten import unsmarten class MarkdownMLizer(OEB2HTML): @@ -34,8 +33,6 @@ class MarkdownMLizer(OEB2HTML): self.style_italic = False txt = self.mlize_spine(oeb_book) - if self.opts.unsmarten_punctuation: - txt = unsmarten(txt) # Do some tidying up txt = self.tidy_up(txt) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 47ee734a04..d9c42eb1dc 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -56,10 +56,6 @@ class TXTOutput(OutputFormatPlugin): '* plain: Produce plain text.\n' '* markdown: Produce Markdown formatted text.\n' '* textile: Produce Textile formatted text.')), - OptionRecommendation(name='unsmarten_punctuation', - recommended_value=False, level=OptionRecommendation.LOW, - help=_('Convert fancy quotes, dashes and ellipsis to their ' - 'plain equivalents.')), OptionRecommendation(name='keep_links', recommended_value=False, level=OptionRecommendation.LOW, help=_('Do not remove links within the document. 
This is only ' \ diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 5d2b03d98e..2320fbbbc7 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -12,8 +12,6 @@ import re from lxml import etree -from calibre.ebooks.txt.unsmarten import unsmarten - BLOCK_TAGS = [ 'div', @@ -78,8 +76,6 @@ class TXTMLizer(object): output += '\n\n\n\n\n\n' output = u''.join(output) output = u'\n'.join(l.rstrip() for l in output.splitlines()) - if self.opts.unsmarten_punctuation: - output = unsmarten(output) output = self.cleanup_text(output) return output diff --git a/src/calibre/ebooks/txt/unsmarten.py b/src/calibre/ebooks/txt/unsmarten.py deleted file mode 100644 index 53f686c2fc..0000000000 --- a/src/calibre/ebooks/txt/unsmarten.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- - -__license__ = 'GPL 3' -__copyright__ = '2011, Leigh Parry ' -__docformat__ = 'restructuredtext en' - -import re - -def unsmarten(txt): - txt = re.sub(u'–|–|–', r'-', txt) # en-dash - txt = re.sub(u'—|—|—', r'--', txt) # em-dash - txt = re.sub(u'…|…|…', r'...', txt) # ellipsis - - txt = re.sub(u'“|”|″|“|”|″|“|”|″', r'"', txt) # double quote - txt = re.sub(u'(["\'‘“]|\s)’', r"\1{'/}", txt) # apostrophe - txt = re.sub(u'‘|’|′|‘|’|′|‘|’|′', r"'", txt) # single quote - - return txt diff --git a/src/calibre/gui2/convert/look_and_feel.py b/src/calibre/gui2/convert/look_and_feel.py index 9b008c0d6d..5ca7e1ea02 100644 --- a/src/calibre/gui2/convert/look_and_feel.py +++ b/src/calibre/gui2/convert/look_and_feel.py @@ -22,13 +22,14 @@ class LookAndFeelWidget(Widget, Ui_Form): Widget.__init__(self, parent, ['change_justification', 'extra_css', 'base_font_size', 'font_size_mapping', 'line_height', 'minimum_line_height', - 'linearize_tables', 'smarten_punctuation', + 'smarten_punctuation', 'unsmarten_punctuation', 'disable_font_rescaling', 'insert_blank_line', 'remove_paragraph_spacing', 'remove_paragraph_spacing_indent_size', 
'insert_blank_line_size', 'input_encoding', - 'asciiize', 'keep_ligatures'] + 'asciiize', 'keep_ligatures', + 'linearize_tables'] ) for val, text in [ ('original', _('Original')), diff --git a/src/calibre/gui2/convert/look_and_feel.ui b/src/calibre/gui2/convert/look_and_feel.ui index 0aa91e0f47..055d569212 100644 --- a/src/calibre/gui2/convert/look_and_feel.ui +++ b/src/calibre/gui2/convert/look_and_feel.ui @@ -7,7 +7,7 @@ 0 0 642 - 500 + 522 @@ -84,7 +84,7 @@ ... - + :/images/wizard.png:/images/wizard.png @@ -194,13 +194,6 @@ - - - - &Linearize tables - - - @@ -215,7 +208,7 @@ - + Extra &CSS @@ -240,13 +233,6 @@ - - - - Smarten &punctuation - - - @@ -273,6 +259,27 @@ + + + + Smarten &punctuation + + + + + + + &UnSmarten punctuation + + + + + + + &Linearize tables + + + diff --git a/src/calibre/utils/unsmarten.py b/src/calibre/utils/unsmarten.py new file mode 100644 index 0000000000..f37f9fb010 --- /dev/null +++ b/src/calibre/utils/unsmarten.py @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- + +from __future__ import (unicode_literals, division, absolute_import, print_function) + +__license__ = 'GPL 3' +__copyright__ = '2011, John Schember ' +__docformat__ = 'restructuredtext en' + +import re + +from lxml import html as lhtml + +from calibre import prepare_string_for_xml +from calibre.ebooks.oeb.base import barename + +def unsmarten_html(html): + def dump_text(elem): + text = [] + tags = [] + tag = barename(elem.tag) + attribs = elem.attrib + tags.append(tag) + # Turn the attributes into a string we can write with the tag. + at = '' + for k, v in attribs.items(): + at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True)) + # Write the tag. + text.append('<%s%s>' % (tag, at)) + # Process tags that contain text. + if hasattr(elem, 'text') and elem.text: + # Don't modify text in pre tags. + if tag == 'pre': + text.append(elem.text) + else: + text.append(prepare_string_for_xml(unsmarten_text(elem.text))) + # Recurse down into tags within the tag we are in. 
+ for item in elem: + text += dump_text(item) + # Close all open tags. + tags.reverse() + for t in tags: + text.append('</%s>' % t) + # Add the text that is outside of the tag. + if hasattr(elem, 'tail') and elem.tail: + text.append(prepare_string_for_xml(unsmarten_text(elem.tail))) + return text + + content = lhtml.fromstring(html) + html = dump_text(content) + html = ''.join(html) + + return html + + +def unsmarten_text(txt): + txt = re.sub(u'&#8211;|&ndash;|–', r'--', txt) # en-dash + txt = re.sub(u'&#8212;|&mdash;|—', r'---', txt) # em-dash + txt = re.sub(u'&#8230;|&hellip;|…', r'...', txt) # ellipsis + + txt = re.sub(u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|“|”|″', r'"', txt) # double quote + txt = re.sub(u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|‘|’|′', r"'", txt) # single quote + + return txt +