From ec448064aa37ba11fa0a3e8318311b532cc26e1e Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Mon, 5 Sep 2011 12:24:26 -0400
Subject: [PATCH] Make unsmarten punctuation a global option.

---
 src/calibre/ebooks/conversion/cli.py        |  3 +-
 src/calibre/ebooks/conversion/plumber.py    |  7 +++
 src/calibre/ebooks/conversion/preprocess.py | 12 ++++
 src/calibre/ebooks/textile/unsmarten.py     |  3 -
 src/calibre/ebooks/txt/markdownml.py        |  3 -
 src/calibre/ebooks/txt/output.py            |  4 --
 src/calibre/ebooks/txt/txtml.py             |  4 --
 src/calibre/ebooks/txt/unsmarten.py         | 18 ------
 src/calibre/gui2/convert/look_and_feel.py   |  5 +-
 src/calibre/gui2/convert/look_and_feel.ui   | 41 +++++++------
 src/calibre/utils/unsmarten.py              | 64 +++++++++++++++++++++
 11 files changed, 112 insertions(+), 52 deletions(-)
 delete mode 100644 src/calibre/ebooks/txt/unsmarten.py
 create mode 100644 src/calibre/utils/unsmarten.py

diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index c605df0de4..ed332acac2 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -134,7 +134,8 @@ def add_pipeline_options(parser, plumber):
                       'font_size_mapping',
                       'line_height', 'minimum_line_height',
                       'linearize_tables',
-                      'extra_css', 'smarten_punctuation',
+                      'extra_css',
+                      'smarten_punctuation', 'unsmarten_punctuation',
                       'margin_top', 'margin_left', 'margin_right',
                       'margin_bottom', 'change_justification',
                       'insert_blank_line', 'insert_blank_line_size',
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index adff954e62..fefc08b19d 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -415,6 +415,13 @@ OptionRecommendation(name='smarten_punctuation',
             )
         ),
 
+OptionRecommendation(name='unsmarten_punctuation',
+        recommended_value=False, level=OptionRecommendation.LOW,
+        help=_('Convert fancy quotes, dashes and ellipsis to their '
+               'plain equivalents.'
+            )
+        ),
+
 OptionRecommendation(name='read_metadata_from_opf',
             recommended_value=None, level=OptionRecommendation.LOW,
             short_switch='m',
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 0f804cc208..d1ccd8a082 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -605,6 +605,9 @@ class HTMLPreProcessor(object):
 
         if getattr(self.extra_opts, 'smarten_punctuation', False):
             html = self.smarten_punctuation(html)
+            
+        if getattr(self.extra_opts, 'unsmarten_punctuation', False):
+            html = self.unsmarten_punctuation(html)
 
         unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
         if unsupported_unicode_chars:
@@ -636,3 +639,12 @@ class HTMLPreProcessor(object):
         html = re.sub(r'\s--\s', u'\u2014', html)
         return substitute_entites(html)
 
+    def unsmarten_punctuation(self, html):
+        '''Apply fix_nbsp_indents, unsmarten_html, then substitute_entites to html.'''
+        from calibre.utils.unsmarten import unsmarten_html
+        from calibre.ebooks.chardet import substitute_entites
+        from calibre.ebooks.conversion.utils import HeuristicProcessor
+        heuristics = HeuristicProcessor(self.extra_opts, self.log)
+        plain = unsmarten_html(heuristics.fix_nbsp_indents(html))
+        return substitute_entites(plain)
+
diff --git a/src/calibre/ebooks/textile/unsmarten.py b/src/calibre/ebooks/textile/unsmarten.py
index c31bb77c24..94127c5c39 100644
--- a/src/calibre/ebooks/textile/unsmarten.py
+++ b/src/calibre/ebooks/textile/unsmarten.py
@@ -7,9 +7,6 @@ __docformat__ = 'restructuredtext en'
 import re
 
 def unsmarten(txt):
-    from calibre.ebooks.txt.unsmarten import unsmarten as txt_unsmarten
-    txt = txt_unsmarten(txt)
-    
     txt = re.sub(u'&#162;|&cent;|¢',     r'{c\}',  txt)  # cent
     txt = re.sub(u'&#163;|&pound;|£',    r'{L-}',  txt)  # pound
     txt = re.sub(u'&#165;|&yen;|¥',      r'{Y=}',  txt)  # yen
diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py
index 878633add3..79cfabf65e 100644
--- a/src/calibre/ebooks/txt/markdownml.py
+++ b/src/calibre/ebooks/txt/markdownml.py
@@ -15,7 +15,6 @@ from functools import partial
 from calibre.ebooks.htmlz.oeb2html import OEB2HTML
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
 from calibre.ebooks.oeb.stylizer import Stylizer
-from calibre.ebooks.txt.unsmarten import unsmarten
 
 class MarkdownMLizer(OEB2HTML):
 
@@ -34,8 +33,6 @@ class MarkdownMLizer(OEB2HTML):
         self.style_italic = False
 
         txt = self.mlize_spine(oeb_book)
-        if self.opts.unsmarten_punctuation:
-            txt = unsmarten(txt)
 
         # Do some tidying up
         txt = self.tidy_up(txt)
diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py
index 47ee734a04..d9c42eb1dc 100644
--- a/src/calibre/ebooks/txt/output.py
+++ b/src/calibre/ebooks/txt/output.py
@@ -56,10 +56,6 @@ class TXTOutput(OutputFormatPlugin):
                     '* plain: Produce plain text.\n'
                     '* markdown: Produce Markdown formatted text.\n'
                     '* textile: Produce Textile formatted text.')),
-        OptionRecommendation(name='unsmarten_punctuation',
-            recommended_value=False, level=OptionRecommendation.LOW,
-            help=_('Convert fancy quotes, dashes and ellipsis to their '
-            'plain equivalents.')),
         OptionRecommendation(name='keep_links',
             recommended_value=False, level=OptionRecommendation.LOW,
             help=_('Do not remove links within the document. This is only ' \
diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py
index 5d2b03d98e..2320fbbbc7 100644
--- a/src/calibre/ebooks/txt/txtml.py
+++ b/src/calibre/ebooks/txt/txtml.py
@@ -12,8 +12,6 @@ import re
 
 from lxml import etree
 
-from calibre.ebooks.txt.unsmarten import unsmarten
-
 
 BLOCK_TAGS = [
     'div',
@@ -78,8 +76,6 @@ class TXTMLizer(object):
             output += '\n\n\n\n\n\n'
         output = u''.join(output)
         output = u'\n'.join(l.rstrip() for l in output.splitlines())
-        if self.opts.unsmarten_punctuation:
-            output = unsmarten(output)
         output = self.cleanup_text(output)
 
         return output
diff --git a/src/calibre/ebooks/txt/unsmarten.py b/src/calibre/ebooks/txt/unsmarten.py
deleted file mode 100644
index 53f686c2fc..0000000000
--- a/src/calibre/ebooks/txt/unsmarten.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# -*- coding: utf-8 -*-
-
-__license__ = 'GPL 3'
-__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
-__docformat__ = 'restructuredtext en'
-
-import re
-
-def unsmarten(txt):
-    txt = re.sub(u'&#8211;|&ndash;|–', r'-', txt) # en-dash
-    txt = re.sub(u'&#8212;|&mdash;|—', r'--', txt) # em-dash
-    txt = re.sub(u'&#8230;|&hellip;|…', r'...', txt) # ellipsis
-
-    txt = re.sub(u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|“|”|″', r'"', txt)  # double quote
-    txt = re.sub(u'(["\'‘“]|\s)’', r"\1{'/}", txt)  # apostrophe
-    txt = re.sub(u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|‘|’|′', r"'", txt)  # single quote
-
-    return txt
diff --git a/src/calibre/gui2/convert/look_and_feel.py b/src/calibre/gui2/convert/look_and_feel.py
index 9b008c0d6d..5ca7e1ea02 100644
--- a/src/calibre/gui2/convert/look_and_feel.py
+++ b/src/calibre/gui2/convert/look_and_feel.py
@@ -22,13 +22,14 @@ class LookAndFeelWidget(Widget, Ui_Form):
         Widget.__init__(self, parent,
                 ['change_justification', 'extra_css', 'base_font_size',
                     'font_size_mapping', 'line_height', 'minimum_line_height',
-                    'linearize_tables', 'smarten_punctuation',
+                    'smarten_punctuation', 'unsmarten_punctuation',
                     'disable_font_rescaling', 'insert_blank_line',
                     'remove_paragraph_spacing',
                     'remove_paragraph_spacing_indent_size',
                     'insert_blank_line_size',
                     'input_encoding',
-                    'asciiize', 'keep_ligatures']
+                    'asciiize', 'keep_ligatures',
+                    'linearize_tables']
                 )
         for val, text in [
                 ('original', _('Original')),
diff --git a/src/calibre/gui2/convert/look_and_feel.ui b/src/calibre/gui2/convert/look_and_feel.ui
index 0aa91e0f47..055d569212 100644
--- a/src/calibre/gui2/convert/look_and_feel.ui
+++ b/src/calibre/gui2/convert/look_and_feel.ui
@@ -7,7 +7,7 @@
     <x>0</x>
     <y>0</y>
     <width>642</width>
-    <height>500</height>
+    <height>522</height>
    </rect>
   </property>
   <property name="windowTitle">
@@ -84,7 +84,7 @@
         <string>...</string>
        </property>
        <property name="icon">
-        <iconset resource="../../../../resources/images.qrc">
+        <iconset>
          <normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset>
        </property>
        <property name="iconSize">
@@ -194,13 +194,6 @@
    <item row="8" column="2" colspan="3">
     <widget class="QComboBox" name="opt_change_justification"/>
    </item>
-   <item row="9" column="0">
-    <widget class="QCheckBox" name="opt_linearize_tables">
-     <property name="text">
-      <string>&amp;Linearize tables</string>
-     </property>
-    </widget>
-   </item>
    <item row="9" column="1" colspan="4">
     <widget class="QCheckBox" name="opt_asciiize">
      <property name="text">
@@ -215,7 +208,7 @@
      </property>
     </widget>
    </item>
-   <item row="12" column="0" colspan="5">
+   <item row="13" column="0" colspan="5">
     <widget class="QGroupBox" name="groupBox">
      <property name="title">
       <string>Extra &amp;CSS</string>
@@ -240,13 +233,6 @@
      </property>
     </widget>
    </item>
-   <item row="10" column="0">
-    <widget class="QCheckBox" name="opt_smarten_punctuation">
-     <property name="text">
-      <string>Smarten &amp;punctuation</string>
-     </property>
-    </widget>
-   </item>
    <item row="6" column="3">
     <widget class="QLabel" name="label_4">
      <property name="text">
@@ -273,6 +259,27 @@
      </property>
     </widget>
    </item>
+   <item row="9" column="0">
+    <widget class="QCheckBox" name="opt_smarten_punctuation">
+     <property name="text">
+      <string>Smarten &amp;punctuation</string>
+     </property>
+    </widget>
+   </item>
+   <item row="10" column="0">
+    <widget class="QCheckBox" name="opt_unsmarten_punctuation">
+     <property name="text">
+      <string>&amp;UnSmarten punctuation</string>
+     </property>
+    </widget>
+   </item>
+   <item row="10" column="3">
+    <widget class="QCheckBox" name="opt_linearize_tables">
+     <property name="text">
+      <string>&amp;Linearize tables</string>
+     </property>
+    </widget>
+   </item>
   </layout>
  </widget>
  <customwidgets>
diff --git a/src/calibre/utils/unsmarten.py b/src/calibre/utils/unsmarten.py
new file mode 100644
index 0000000000..f37f9fb010
--- /dev/null
+++ b/src/calibre/utils/unsmarten.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import (unicode_literals, division, absolute_import, print_function)
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from lxml import html as lhtml
+
+from calibre import prepare_string_for_xml
+from calibre.ebooks.oeb.base import barename
+
+def unsmarten_html(html):
+    '''
+    Re-serialize html, passing every text node that is not inside a <pre>
+    through unsmarten_text. Tags and attributes are written back unchanged.
+    '''
+    def escape(s, verbatim):
+        # Escape s for output, unsmartening it first unless verbatim is set.
+        return prepare_string_for_xml(s if verbatim else unsmarten_text(s))
+
+    def dump_text(elem, verbatim=False):
+        # verbatim is True when an ancestor is <pre>; the tail of a node
+        # nested inside <pre> is still part of the preformatted text.
+        tail = escape(elem.tail, verbatim) if elem.tail else ''
+        if callable(elem.tag):
+            # Comments and processing instructions carry a factory function
+            # as their tag, which barename() cannot handle. Re-emit comments
+            # as they are, drop anything else, but never lose the tail.
+            if isinstance(elem, lhtml.HtmlComment):
+                return ['<!--%s-->' % (elem.text or ''), tail]
+            return [tail]
+        tag = barename(elem.tag)
+        verbatim = verbatim or tag == 'pre'
+        # Turn the attributes into a string we can write with the tag.
+        at = ''.join(' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
+                     for k, v in elem.attrib.items())
+        text = ['<%s%s>' % (tag, at)]
+        if elem.text:
+            # <pre> text is not unsmartened but is still escaped, so that a
+            # literal < or & in it cannot produce broken markup.
+            text.append(escape(elem.text, verbatim))
+        # Recurse down into the tags nested within this one.
+        for item in elem:
+            text += dump_text(item, verbatim)
+        text += ['</%s>' % tag, tail]
+        return text
+
+    return ''.join(dump_text(lhtml.fromstring(html)))
+
+
+def unsmarten_text(txt):
+    '''Return txt with fancy dashes, ellipses and quotes (or their entities) made plain.'''
+    for fancy, plain in (
+            (u'&#8211;|&ndash;|–', '--'),  # en-dash
+            (u'&#8212;|&mdash;|—', '---'),  # em-dash
+            (u'&#8230;|&hellip;|…', '...'),  # ellipsis
+            (u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|“|”|″', '"'),  # double quote
+            (u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|‘|’|′', "'")):  # single quote
+        txt = re.sub(fancy, plain, txt)
+    return txt