Conversion: Add option to unsmarten punctuation under Look & Feel

2025-07-09 03:04:10 -04:00 · 2011-09-06 21:20:38 -06:00 · 2011-09-06 21:20:38 -06:00 · c5c9738f63
commit c5c9738f63
parent 994a31e686 1ce4a97f63
13 changed files with 132 additions and 64 deletions
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@ -134,7 +134,8 @@ def add_pipeline_options(parser, plumber):
                      'font_size_mapping',
                      'line_height', 'minimum_line_height',
                      'linearize_tables',
-                      'extra_css', 'smarten_punctuation',
+                      'extra_css',
+                      'smarten_punctuation', 'unsmarten_punctuation',
                      'margin_top', 'margin_left', 'margin_right',
                      'margin_bottom', 'change_justification',
                      'insert_blank_line', 'insert_blank_line_size',
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -415,6 +415,13 @@ OptionRecommendation(name='smarten_punctuation',
            )
        ),

+OptionRecommendation(name='unsmarten_punctuation',
+        recommended_value=False, level=OptionRecommendation.LOW,
+        help=_('Convert fancy quotes, dashes and ellipsis to their '
+               'plain equivalents.'
+            )
+        ),
+
 OptionRecommendation(name='read_metadata_from_opf',
            recommended_value=None, level=OptionRecommendation.LOW,
            short_switch='m',
@ -1017,6 +1024,10 @@ OptionRecommendation(name='sr3_replace',
                self.output_plugin.file_type not in ('mobi', 'lrf'):
            from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
            LinearizeTables()(self.oeb, self.opts)
+            
+        if self.opts.unsmarten_punctuation:
+            from calibre.ebooks.oeb.transforms.unsmarten import UnsmartenPunctuation
+            UnsmartenPunctuation()(self.oeb, self.opts)

        flattener = CSSFlattener(fbase=fbase, fkey=fkey,
                lineh=line_height,
--- a/src/calibre/ebooks/oeb/transforms/unsmarten.py
+++ b/src/calibre/ebooks/oeb/transforms/unsmarten.py
@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import (unicode_literals, division, absolute_import, print_function)
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.ebooks.oeb.base import OEB_DOCS, XPath, barename
+from calibre.utils.unsmarten import unsmarten_text
+
+class UnsmartenPunctuation(object):
+    '''
+    OEB transform that replaces typographic ("smart") punctuation in the
+    text of the book's documents with plain equivalents, by passing element
+    text through ``calibre.utils.unsmarten.unsmarten_text``.
+
+    Instances are callable as ``UnsmartenPunctuation()(oeb, opts)``.
+    '''
+
+    def __init__(self):
+        # Selects every descendant element in the ``h:`` namespace.
+        self.html_tags = XPath('descendant::h:*')
+
+    def unsmarten(self, root):
+        '''
+        Unsmarten, in place, the ``text`` and ``tail`` of every element
+        selected below `root`, leaving ``<pre>`` elements untouched.
+
+        :param root: Element whose descendants are processed.
+        '''
+        for x in self.html_tags(root):
+            # barename() operates on a tag name, not on an element. Passing
+            # the element itself never compared equal to 'pre', so
+            # preformatted text was being converted as well.
+            # NOTE(review): elements nested inside <pre> are still converted,
+            # and the tail following </pre> is skipped -- confirm whether
+            # that is the intended scope of the exclusion.
+            if barename(x.tag) == 'pre':
+                continue
+            if x.text:
+                x.text = unsmarten_text(x.text)
+            if x.tail:
+                x.tail = unsmarten_text(x.tail)
+
+    def __call__(self, oeb, context):
+        '''
+        Run the transform over every document in the book's manifest.
+
+        :param oeb: Book whose ``manifest.items`` are scanned; only items
+                    whose ``media_type`` is in ``OEB_DOCS`` are processed.
+        :param context: Not used by this transform; accepted so that it can
+                        be invoked as ``Transform()(oeb, opts)``.
+        '''
+        bx = XPath('//h:body')
+        for x in oeb.manifest.items:
+            if x.media_type in OEB_DOCS:
+                for body in bx(x.data):
+                    self.unsmarten(body)
+
--- a/src/calibre/ebooks/textile/unsmarten.py
+++ b/src/calibre/ebooks/textile/unsmarten.py
@ -7,9 +7,6 @@ __docformat__ = 'restructuredtext en'
 import re

 def unsmarten(txt):
-    from calibre.ebooks.txt.unsmarten import unsmarten as txt_unsmarten
-    txt = txt_unsmarten(txt)
-    
    txt = re.sub(u'&#162;|&cent;|¢',     r'{c\}',  txt)  # cent
    txt = re.sub(u'&#163;|&pound;|£',    r'{L-}',  txt)  # pound
    txt = re.sub(u'&#165;|&yen;|¥',      r'{Y=}',  txt)  # yen
--- a/src/calibre/ebooks/txt/markdownml.py
+++ b/src/calibre/ebooks/txt/markdownml.py
@ -15,7 +15,6 @@ from functools import partial
 from calibre.ebooks.htmlz.oeb2html import OEB2HTML
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
 from calibre.ebooks.oeb.stylizer import Stylizer
-from calibre.ebooks.txt.unsmarten import unsmarten

 class MarkdownMLizer(OEB2HTML):

@ -34,8 +33,6 @@ class MarkdownMLizer(OEB2HTML):
        self.style_italic = False

        txt = self.mlize_spine(oeb_book)
-        if self.opts.unsmarten_punctuation:
-            txt = unsmarten(txt)

        # Do some tidying up
        txt = self.tidy_up(txt)
--- a/src/calibre/ebooks/txt/output.py
+++ b/src/calibre/ebooks/txt/output.py
@ -56,10 +56,6 @@ class TXTOutput(OutputFormatPlugin):
                    '* plain: Produce plain text.\n'
                    '* markdown: Produce Markdown formatted text.\n'
                    '* textile: Produce Textile formatted text.')),
-        OptionRecommendation(name='unsmarten_punctuation',
-            recommended_value=False, level=OptionRecommendation.LOW,
-            help=_('Convert fancy quotes, dashes and ellipsis to their '
-            'plain equivalents.')),
        OptionRecommendation(name='keep_links',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not remove links within the document. This is only ' \
--- a/src/calibre/ebooks/txt/textileml.py
+++ b/src/calibre/ebooks/txt/textileml.py
@ -83,7 +83,7 @@ class TextileMLizer(OEB2HTML):
            for i in self.our_ids:
                if i not in self.our_links:
                    text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text)
-                    
+
        # Remove obvious non-needed escaping, add sub/sup-script ones
        text = check_escaping(text, ['\*', '_', '\*'])
        # escape the super/sub-scripts if needed
@ -189,7 +189,7 @@ class TextileMLizer(OEB2HTML):
        emright = int(round(right / stylizer.profile.fbase))
        if emright >= 1:
            txt += ')' * emright
-            
+
        return txt

    def check_id_tag(self, attribs):
@ -235,7 +235,7 @@ class TextileMLizer(OEB2HTML):
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib
-        
+
        # Ignore anything that is set to not be displayed.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
@ -246,7 +246,7 @@ class TextileMLizer(OEB2HTML):
            ems = int(round(float(style.marginTop) / style.fontSize) - 1)
            if ems >= 1:
                text.append(u'\n\n\xa0' * ems)
-            
+
        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
            if tag == 'div':
                tag = 'p'
@ -432,7 +432,7 @@ class TextileMLizer(OEB2HTML):
                'span', 'table', 'tr', 'td'):
            if not self.in_a_link:
                text.append(self.check_styles(style))
-        
+
        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            txt = elem.text
--- a/src/calibre/ebooks/txt/txtml.py
+++ b/src/calibre/ebooks/txt/txtml.py
@ -12,8 +12,6 @@ import re

 from lxml import etree

-from calibre.ebooks.txt.unsmarten import unsmarten
-

 BLOCK_TAGS = [
    'div',
@ -78,8 +76,6 @@ class TXTMLizer(object):
            output += '\n\n\n\n\n\n'
        output = u''.join(output)
        output = u'\n'.join(l.rstrip() for l in output.splitlines())
-        if self.opts.unsmarten_punctuation:
-            output = unsmarten(output)
        output = self.cleanup_text(output)

        return output
--- a/src/calibre/ebooks/txt/unsmarten.py
+++ b/src/calibre/ebooks/txt/unsmarten.py
@ -1,18 +0,0 @@
-# -*- coding: utf-8 -*-
-
-__license__ = 'GPL 3'
-__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
-__docformat__ = 'restructuredtext en'
-
-import re
-
-def unsmarten(txt):
-    txt = re.sub(u'&#8211;|&ndash;|–', r'-', txt) # en-dash
-    txt = re.sub(u'&#8212;|&mdash;|—', r'--', txt) # em-dash
-    txt = re.sub(u'&#8230;|&hellip;|…', r'...', txt) # ellipsis
-
-    txt = re.sub(u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|“|”|″', r'"', txt)  # double quote
-    txt = re.sub(u'(["\'‘“]|\s)’', r"\1{'/}", txt)  # apostrophe
-    txt = re.sub(u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|‘|’|′', r"'", txt)  # single quote
-
-    return txt
--- a/src/calibre/gui2/convert/look_and_feel.py
+++ b/src/calibre/gui2/convert/look_and_feel.py
@ -22,13 +22,14 @@ class LookAndFeelWidget(Widget, Ui_Form):
        Widget.__init__(self, parent,
                ['change_justification', 'extra_css', 'base_font_size',
                    'font_size_mapping', 'line_height', 'minimum_line_height',
-                    'linearize_tables', 'smarten_punctuation',
+                    'smarten_punctuation', 'unsmarten_punctuation',
                    'disable_font_rescaling', 'insert_blank_line',
                    'remove_paragraph_spacing',
                    'remove_paragraph_spacing_indent_size',
                    'insert_blank_line_size',
                    'input_encoding',
-                    'asciiize', 'keep_ligatures']
+                    'asciiize', 'keep_ligatures',
+                    'linearize_tables']
                )
        for val, text in [
                ('original', _('Original')),
--- a/src/calibre/gui2/convert/look_and_feel.ui
+++ b/src/calibre/gui2/convert/look_and_feel.ui
@ -7,7 +7,7 @@
    <x>0</x>
    <y>0</y>
    <width>642</width>
-    <height>500</height>
+    <height>522</height>
   </rect>
  </property>
  <property name="windowTitle">
@ -84,7 +84,7 @@
        <string>...</string>
       </property>
       <property name="icon">
-        <iconset resource="../../../../resources/images.qrc">
+        <iconset>
         <normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset>
       </property>
       <property name="iconSize">
@ -194,13 +194,6 @@
   <item row="8" column="2" colspan="3">
    <widget class="QComboBox" name="opt_change_justification"/>
   </item>
-   <item row="9" column="0">
-    <widget class="QCheckBox" name="opt_linearize_tables">
-     <property name="text">
-      <string>&amp;Linearize tables</string>
-     </property>
-    </widget>
-   </item>
   <item row="9" column="1" colspan="4">
    <widget class="QCheckBox" name="opt_asciiize">
     <property name="text">
@ -215,7 +208,7 @@
     </property>
    </widget>
   </item>
-   <item row="12" column="0" colspan="5">
+   <item row="13" column="0" colspan="5">
    <widget class="QGroupBox" name="groupBox">
     <property name="title">
      <string>Extra &amp;CSS</string>
@ -240,13 +233,6 @@
     </property>
    </widget>
   </item>
-   <item row="10" column="0">
-    <widget class="QCheckBox" name="opt_smarten_punctuation">
-     <property name="text">
-      <string>Smarten &amp;punctuation</string>
-     </property>
-    </widget>
-   </item>
   <item row="6" column="3">
    <widget class="QLabel" name="label_4">
     <property name="text">
@ -273,6 +259,27 @@
     </property>
    </widget>
   </item>
+   <item row="9" column="0">
+    <widget class="QCheckBox" name="opt_smarten_punctuation">
+     <property name="text">
+      <string>Smarten &amp;punctuation</string>
+     </property>
+    </widget>
+   </item>
+   <item row="10" column="0">
+    <widget class="QCheckBox" name="opt_unsmarten_punctuation">
+     <property name="text">
+      <string>&amp;UnSmarten punctuation</string>
+     </property>
+    </widget>
+   </item>
+   <item row="10" column="3">
+    <widget class="QCheckBox" name="opt_linearize_tables">
+     <property name="text">
+      <string>&amp;Linearize tables</string>
+     </property>
+    </widget>
+   </item>
  </layout>
 </widget>
 <customwidgets>
--- a/src/calibre/utils/mreplace.py
+++ b/src/calibre/utils/mreplace.py
@ -7,26 +7,32 @@ import re
 from UserDict import UserDict

 class MReplace(UserDict):
-    def __init__(self, dict = None):
-        UserDict.__init__(self, dict)
+
+    def __init__(self, data=None, case_sensitive=True):
+        UserDict.__init__(self, data)
        self.re = None
        self.regex = None
+        self.case_sensitive = case_sensitive
        self.compile_regex()

-    def compile_regex(self): 
+    def compile_regex(self):
        if len(self.data) > 0:
            keys = sorted(self.data.keys(), key=len)
            keys.reverse()
            tmp = "(%s)" % "|".join(map(re.escape, keys))
            if self.re != tmp:
                self.re = tmp
-                self.regex = re.compile(self.re)
+                if self.case_sensitive:
+                    self.regex = re.compile(self.re)
+                else:
+                    self.regex = re.compile(self.re, re.I)

-    def __call__(self, mo): 
+    def __call__(self, mo):
        return self[mo.string[mo.start():mo.end()]]

-    def mreplace(self, text): 
+    def mreplace(self, text):
        #Replace without regex compile
        if len(self.data) < 1 or self.re is None:
            return text
-        return self.regex.sub(self, text)
+        return self.regex.sub(self, text)
+
--- a/src/calibre/utils/unsmarten.py
+++ b/src/calibre/utils/unsmarten.py
@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import (unicode_literals, division, absolute_import, print_function)
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.utils.mreplace import MReplace
+
+# Lookup table mapping "smart" punctuation to plain ASCII. Each character is
+# listed three ways: numeric character reference, named entity and the
+# literal character itself.
+_mreplace = MReplace({
+        # en dash
+        '&#8211;': '--',
+        '&ndash;': '--',
+        '–': '--',
+        # em dash
+        '&#8212;': '---',
+        '&mdash;': '---',
+        '—': '---',
+        # horizontal ellipsis
+        '&#8230;': '...',
+        '&hellip;': '...',
+        '…': '...',
+        # left/right double quotation marks and double prime
+        '&#8220;': '"',
+        '&#8221;': '"',
+        '&#8243;': '"',
+        '&ldquo;': '"',
+        '&rdquo;': '"',
+        '&Prime;': '"',
+        '“':'"',
+        '”':'"',
+        '″':'"',
+        # left/right single quotation marks and prime
+        '&#8216;':"'",
+        '&#8217;':"'",
+        '&#8242;':"'",
+        '&lsquo;':"'",
+        '&rsquo;':"'",
+        '&prime;':"'",
+        '‘':"'",
+        '’':"'",
+        '′':"'",
+}
+)
+# Module entry point: takes a string and returns it with every key of the
+# table above replaced by its mapped value.
+unsmarten_text = _mreplace.mreplace
+
+