Make unsmarten punctuation a global option.

2025-07-09 03:04:10 -04:00 · 2011-09-05 12:24:26 -04:00 · 2011-09-05 12:24:26 -04:00 · ec448064aa
commit ec448064aa
parent 123991aea5
11 changed files with 112 additions and 52 deletions
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -134,7 +134,8 @@ def add_pipeline_options(parser, plumber):
                      'font_size_mapping',
                      'line_height', 'minimum_line_height',
                      'linearize_tables',
-                      'extra_css', 'smarten_punctuation',
+                      'extra_css',
+                      'smarten_punctuation', 'unsmarten_punctuation',
                      'margin_top', 'margin_left', 'margin_right',
                      'margin_bottom', 'change_justification',
                      'insert_blank_line', 'insert_blank_line_size',
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -415,6 +415,13 @@ OptionRecommendation(name='smarten_punctuation',
            )
        ),

+OptionRecommendation(name='unsmarten_punctuation',
+        recommended_value=False, level=OptionRecommendation.LOW,
+        help=_('Convert fancy quotes, dashes and ellipsis to their '
+               'plain equivalents.'
+            )
+        ),
+
 OptionRecommendation(name='read_metadata_from_opf',
            recommended_value=None, level=OptionRecommendation.LOW,
            short_switch='m',
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -605,6 +605,9 @@ class HTMLPreProcessor(object):

        if getattr(self.extra_opts, 'smarten_punctuation', False):
            html = self.smarten_punctuation(html)
+            
+        if getattr(self.extra_opts, 'unsmarten_punctuation', False):
+            html = self.unsmarten_punctuation(html)

        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
        if unsupported_unicode_chars:
@@ -636,3 +639,12 @@ class HTMLPreProcessor(object):
        html = re.sub(r'\s--\s', u'\u2014', html)
        return substitute_entites(html)

+    def unsmarten_punctuation(self, html):
+        from calibre.utils.unsmarten import unsmarten_html
+        from calibre.ebooks.chardet import substitute_entites
+        from calibre.ebooks.conversion.utils import HeuristicProcessor
+        preprocessor = HeuristicProcessor(self.extra_opts, self.log)
+        html = preprocessor.fix_nbsp_indents(html)
+        html = unsmarten_html(html)
+        return substitute_entites(html)
+
--- a/src/calibre/ebooks/textile/unsmarten.py
+++ b/src/calibre/ebooks/textile/unsmarten.py
@@ -7,9 +7,6 @@ __docformat__ = 'restructuredtext en'
 import re

 def unsmarten(txt):
-    from calibre.ebooks.txt.unsmarten import unsmarten as txt_unsmarten
-    txt = txt_unsmarten(txt)
-    
    txt = re.sub(u'&#162;|&cent;|¢',     r'{c\}',  txt)  # cent
    txt = re.sub(u'&#163;|&pound;|£',    r'{L-}',  txt)  # pound
    txt = re.sub(u'&#165;|&yen;|¥',      r'{Y=}',  txt)  # yen
--- a/src/calibre/ebooks/txt/markdownml.py
+++ b/src/calibre/ebooks/txt/markdownml.py
@@ -15,7 +15,6 @@ from functools import partial
 from calibre.ebooks.htmlz.oeb2html import OEB2HTML
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
 from calibre.ebooks.oeb.stylizer import Stylizer
-from calibre.ebooks.txt.unsmarten import unsmarten

 class MarkdownMLizer(OEB2HTML):

@@ -34,8 +33,6 @@ class MarkdownMLizer(OEB2HTML):
        self.style_italic = False

        txt = self.mlize_spine(oeb_book)
-        if self.opts.unsmarten_punctuation:
-            txt = unsmarten(txt)

        # Do some tidying up
        txt = self.tidy_up(txt)
--- a/src/calibre/ebooks/txt/output.py
+++ b/src/calibre/ebooks/txt/output.py
@@ -56,10 +56,6 @@ class TXTOutput(OutputFormatPlugin):
                    '* plain: Produce plain text.\n'
                    '* markdown: Produce Markdown formatted text.\n'
                    '* textile: Produce Textile formatted text.')),
-        OptionRecommendation(name='unsmarten_punctuation',
-            recommended_value=False, level=OptionRecommendation.LOW,
-            help=_('Convert fancy quotes, dashes and ellipsis to their '
-            'plain equivalents.')),
        OptionRecommendation(name='keep_links',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not remove links within the document. This is only ' \
--- a/src/calibre/ebooks/txt/txtml.py
+++ b/src/calibre/ebooks/txt/txtml.py
@@ -12,8 +12,6 @@ import re

 from lxml import etree

-from calibre.ebooks.txt.unsmarten import unsmarten
-

 BLOCK_TAGS = [
    'div',
@@ -78,8 +76,6 @@ class TXTMLizer(object):
            output += '\n\n\n\n\n\n'
        output = u''.join(output)
        output = u'\n'.join(l.rstrip() for l in output.splitlines())
-        if self.opts.unsmarten_punctuation:
-            output = unsmarten(output)
        output = self.cleanup_text(output)

        return output
--- a/src/calibre/ebooks/txt/unsmarten.py
+++ b/src/calibre/ebooks/txt/unsmarten.py
@@ -1,18 +0,0 @@
-# -*- coding: utf-8 -*-
-
-__license__ = 'GPL 3'
-__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
-__docformat__ = 'restructuredtext en'
-
-import re
-
-def unsmarten(txt):
-    txt = re.sub(u'&#8211;|&ndash;|–', r'-', txt) # en-dash
-    txt = re.sub(u'&#8212;|&mdash;|—', r'--', txt) # em-dash
-    txt = re.sub(u'&#8230;|&hellip;|…', r'...', txt) # ellipsis
-
-    txt = re.sub(u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|“|”|″', r'"', txt)  # double quote
-    txt = re.sub(u'(["\'‘“]|\s)’', r"\1{'/}", txt)  # apostrophe
-    txt = re.sub(u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|‘|’|′', r"'", txt)  # single quote
-
-    return txt
--- a/src/calibre/gui2/convert/look_and_feel.py
+++ b/src/calibre/gui2/convert/look_and_feel.py
@@ -22,13 +22,14 @@ class LookAndFeelWidget(Widget, Ui_Form):
        Widget.__init__(self, parent,
                ['change_justification', 'extra_css', 'base_font_size',
                    'font_size_mapping', 'line_height', 'minimum_line_height',
-                    'linearize_tables', 'smarten_punctuation',
+                    'smarten_punctuation', 'unsmarten_punctuation',
                    'disable_font_rescaling', 'insert_blank_line',
                    'remove_paragraph_spacing',
                    'remove_paragraph_spacing_indent_size',
                    'insert_blank_line_size',
                    'input_encoding',
-                    'asciiize', 'keep_ligatures']
+                    'asciiize', 'keep_ligatures',
+                    'linearize_tables']
                )
        for val, text in [
                ('original', _('Original')),
--- a/src/calibre/gui2/convert/look_and_feel.ui
+++ b/src/calibre/gui2/convert/look_and_feel.ui
@@ -7,7 +7,7 @@
    <x>0</x>
    <y>0</y>
    <width>642</width>
-    <height>500</height>
+    <height>522</height>
   </rect>
  </property>
  <property name="windowTitle">
@@ -84,7 +84,7 @@
        <string>...</string>
       </property>
       <property name="icon">
-        <iconset resource="../../../../resources/images.qrc">
+        <iconset>
         <normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset>
       </property>
       <property name="iconSize">
@@ -194,13 +194,6 @@
   <item row="8" column="2" colspan="3">
    <widget class="QComboBox" name="opt_change_justification"/>
   </item>
-   <item row="9" column="0">
-    <widget class="QCheckBox" name="opt_linearize_tables">
-     <property name="text">
-      <string>&amp;Linearize tables</string>
-     </property>
-    </widget>
-   </item>
   <item row="9" column="1" colspan="4">
    <widget class="QCheckBox" name="opt_asciiize">
     <property name="text">
@@ -215,7 +208,7 @@
     </property>
    </widget>
   </item>
-   <item row="12" column="0" colspan="5">
+   <item row="13" column="0" colspan="5">
    <widget class="QGroupBox" name="groupBox">
     <property name="title">
      <string>Extra &amp;CSS</string>
@@ -240,13 +233,6 @@
     </property>
    </widget>
   </item>
-   <item row="10" column="0">
-    <widget class="QCheckBox" name="opt_smarten_punctuation">
-     <property name="text">
-      <string>Smarten &amp;punctuation</string>
-     </property>
-    </widget>
-   </item>
   <item row="6" column="3">
    <widget class="QLabel" name="label_4">
     <property name="text">
@@ -273,6 +259,27 @@
     </property>
    </widget>
   </item>
+   <item row="9" column="0">
+    <widget class="QCheckBox" name="opt_smarten_punctuation">
+     <property name="text">
+      <string>Smarten &amp;punctuation</string>
+     </property>
+    </widget>
+   </item>
+   <item row="10" column="0">
+    <widget class="QCheckBox" name="opt_unsmarten_punctuation">
+     <property name="text">
+      <string>&amp;UnSmarten punctuation</string>
+     </property>
+    </widget>
+   </item>
+   <item row="10" column="3">
+    <widget class="QCheckBox" name="opt_linearize_tables">
+     <property name="text">
+      <string>&amp;Linearize tables</string>
+     </property>
+    </widget>
+   </item>
  </layout>
 </widget>
 <customwidgets>
--- a/src/calibre/utils/unsmarten.py
+++ b/src/calibre/utils/unsmarten.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import (unicode_literals, division, absolute_import, print_function)
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from lxml import html as lhtml
+
+from calibre import prepare_string_for_xml
+from calibre.ebooks.oeb.base import barename
+
+def unsmarten_html(html):
+    def dump_text(elem):
+        text = []
+        tags = []
+        tag = barename(elem.tag)
+        attribs = elem.attrib
+        tags.append(tag)
+        # Turn the attributes into a string we can write with the tag.
+        at = ''
+        for k, v in attribs.items():
+            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
+        # Write the tag.
+        text.append('<%s%s>' % (tag, at))
+        # Process tags that contain text.
+        if hasattr(elem, 'text') and elem.text:
+            # Don't modify text in pre tags.
+            if tag == 'pre':
+                text.append(elem.text)
+            else:
+                text.append(prepare_string_for_xml(unsmarten_text(elem.text)))
+        # Recurse down into tags within the tag we are in.
+        for item in elem:
+            text += dump_text(item)
+        # Close all open tags.
+        tags.reverse()
+        for t in tags:
+            text.append('</%s>' % t)
+        # Add the text that is outside of the tag.
+        if hasattr(elem, 'tail') and elem.tail:
+            text.append(prepare_string_for_xml(unsmarten_text(elem.tail)))
+        return text
+    
+    content = lhtml.fromstring(html)
+    html = dump_text(content)
+    html = ''.join(html)
+    
+    return html
+
+
+def unsmarten_text(txt):
+    txt = re.sub(u'&#8211;|&ndash;|–', r'--', txt) # en-dash
+    txt = re.sub(u'&#8212;|&mdash;|—', r'---', txt) # em-dash
+    txt = re.sub(u'&#8230;|&hellip;|…', r'...', txt) # ellipsis
+
+    txt = re.sub(u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|“|”|″', r'"', txt)  # double quote
+    txt = re.sub(u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|‘|’|′', r"'", txt)  # single quote
+
+    return txt
+