Change the unsmarten option to be an OEB transform, make it handle more cases, and use a simpler implementation.

2025-06-23 15:30:45 -04:00 · 2011-09-06 19:07:10 -04:00 · 2011-09-06 19:07:10 -04:00 · 1ce4a97f63
commit 1ce4a97f63
parent ec448064aa
4 changed files with 29 additions and 47 deletions
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -1024,6 +1024,10 @@ OptionRecommendation(name='sr3_replace',
                self.output_plugin.file_type not in ('mobi', 'lrf'):
            from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
            LinearizeTables()(self.oeb, self.opts)
+            
+        if self.opts.unsmarten_punctuation:
+            from calibre.ebooks.oeb.transforms.unsmarten import UnsmartenPunctuation
+            UnsmartenPunctuation()(self.oeb, self.opts)

        flattener = CSSFlattener(fbase=fbase, fkey=fkey,
                lineh=line_height,
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -605,9 +605,6 @@ class HTMLPreProcessor(object):

        if getattr(self.extra_opts, 'smarten_punctuation', False):
            html = self.smarten_punctuation(html)
-            
-        if getattr(self.extra_opts, 'unsmarten_punctuation', False):
-            html = self.unsmarten_punctuation(html)

        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
        if unsupported_unicode_chars:
--- a/src/calibre/ebooks/oeb/transforms/unsmarten.py
+++ b/src/calibre/ebooks/oeb/transforms/unsmarten.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import (unicode_literals, division, absolute_import, print_function)
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.ebooks.oeb.base import OEB_DOCS, XPath, barename
+from calibre.utils.unsmarten import unsmarten_text
+
+class UnsmartenPunctuation(object):
+    
+    def unsmarten(self, root):
+        for x in XPath('//h:*')(root):
+            if not barename(x) == 'pre':
+                if hasattr(x, 'text') and x.text:
+                    x.text = unsmarten_text(x.text)
+                if hasattr(x, 'tail') and x.tail:
+                    x.tail = unsmarten_text(x.tail)
+
+    def __call__(self, oeb, context):
+        for x in oeb.manifest.items:
+            if x.media_type in OEB_DOCS:
+                self.unsmarten(x.data)
--- a/src/calibre/utils/unsmarten.py
+++ b/src/calibre/utils/unsmarten.py
@@ -8,50 +8,6 @@ __docformat__ = 'restructuredtext en'

 import re

-from lxml import html as lhtml
-
-from calibre import prepare_string_for_xml
-from calibre.ebooks.oeb.base import barename
-
-def unsmarten_html(html):
-    def dump_text(elem):
-        text = []
-        tags = []
-        tag = barename(elem.tag)
-        attribs = elem.attrib
-        tags.append(tag)
-        # Turn the attributes into a string we can write with the tag.
-        at = ''
-        for k, v in attribs.items():
-            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
-        # Write the tag.
-        text.append('<%s%s>' % (tag, at))
-        # Process tags that contain text.
-        if hasattr(elem, 'text') and elem.text:
-            # Don't modify text in pre tags.
-            if tag == 'pre':
-                text.append(elem.text)
-            else:
-                text.append(prepare_string_for_xml(unsmarten_text(elem.text)))
-        # Recurse down into tags within the tag we are in.
-        for item in elem:
-            text += dump_text(item)
-        # Close all open tags.
-        tags.reverse()
-        for t in tags:
-            text.append('</%s>' % t)
-        # Add the text that is outside of the tag.
-        if hasattr(elem, 'tail') and elem.tail:
-            text.append(prepare_string_for_xml(unsmarten_text(elem.tail)))
-        return text
-    
-    content = lhtml.fromstring(html)
-    html = dump_text(content)
-    html = ''.join(html)
-    
-    return html
-
-
 def unsmarten_text(txt):
    txt = re.sub(u'&#8211;|&ndash;|–', r'--', txt) # en-dash
    txt = re.sub(u'&#8212;|&mdash;|—', r'---', txt) # em-dash