Change the unsmarten option to be an OEB transform, make it handle more cases, and use a simpler implementation.

2025-11-04 03:27:00 -05:00 · 2011-09-06 19:07:10 -04:00 · 2011-09-06 19:07:10 -04:00 · 1ce4a97f63
commit 1ce4a97f63
parent ec448064aa
4 changed files with 29 additions and 47 deletions
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -1025,6 +1025,10 @@ OptionRecommendation(name='sr3_replace',
            from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
            LinearizeTables()(self.oeb, self.opts)
        if self.opts.unsmarten_punctuation:
            from calibre.ebooks.oeb.transforms.unsmarten import UnsmartenPunctuation
            UnsmartenPunctuation()(self.oeb, self.opts)
        flattener = CSSFlattener(fbase=fbase, fkey=fkey,
                lineh=line_height,
                untable=self.output_plugin.file_type in ('mobi','lit'),
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -606,9 +606,6 @@ class HTMLPreProcessor(object):
        if getattr(self.extra_opts, 'smarten_punctuation', False):
            html = self.smarten_punctuation(html)
        if getattr(self.extra_opts, 'unsmarten_punctuation', False):
            html = self.unsmarten_punctuation(html)
        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
        if unsupported_unicode_chars:
            from calibre.utils.localization import get_udc
--- a/src/calibre/ebooks/oeb/transforms/unsmarten.py
+++ b/src/calibre/ebooks/oeb/transforms/unsmarten.py
@ -0,0 +1,25 @@
 # -*- coding: utf-8 -*-
 from __future__ import (unicode_literals, division, absolute_import, print_function)
 __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 from calibre.ebooks.oeb.base import OEB_DOCS, XPath, barename
 from calibre.utils.unsmarten import unsmarten_text
 class UnsmartenPunctuation(object):
    """OEB transform that runs ``unsmarten_text`` over the text of HTML elements.

    An instance is callable as ``transform(oeb, context)``; the documents in
    the OEB manifest are modified in place.
    """
    def unsmarten(self, root):
        """Unsmarten the ``text`` and ``tail`` of every XHTML element under *root*.

        Elements whose bare tag name is ``pre`` are left untouched (both their
        ``text`` and their ``tail``).

        NOTE(review): only the ``pre`` element itself is skipped; elements
        nested inside a ``pre`` are still matched by the XPath and processed.

        :param root: parsed document tree to modify in place.
        """
        for x in XPath('//h:*')(root):
            # Pass the tag name, not the element itself, so the comparison
            # with 'pre' can actually match.
            if barename(x.tag) == 'pre':
                continue
            if getattr(x, 'text', None):
                x.text = unsmarten_text(x.text)
            if getattr(x, 'tail', None):
                x.tail = unsmarten_text(x.tail)
    def __call__(self, oeb, context):
        """Apply :meth:`unsmarten` to every manifest item whose media type is in ``OEB_DOCS``.

        :param oeb: book object; ``oeb.manifest.items`` is iterated.
        :param context: accepted for the transform call signature; not used here.
        """
        for x in oeb.manifest.items:
            if x.media_type in OEB_DOCS:
                self.unsmarten(x.data)
--- a/src/calibre/utils/unsmarten.py
+++ b/src/calibre/utils/unsmarten.py
@ -8,50 +8,6 @@ __docformat__ = 'restructuredtext en'
 import re
 from lxml import html as lhtml
 from calibre import prepare_string_for_xml
 from calibre.ebooks.oeb.base import barename
 def unsmarten_html(html):
    """Parse *html*, unsmarten its text nodes and return the re-serialized markup.

    The markup is parsed with ``lxml.html.fromstring`` and written back out by
    hand: each element becomes ``<tag attrs>``, its text, its children, then
    ``</tag>`` followed by its tail. Text and tail are passed through
    ``unsmarten_text`` except for the text directly inside a ``pre`` element,
    which keeps its original wording.

    :param html: markup string to process.
    :return: the rebuilt markup as a single string.
    """
    def dump_text(elem):
        """Return the serialized pieces (a list of strings) for *elem* and its tail."""
        tag = barename(elem.tag)
        # Turn the attributes into a string we can write with the tag.
        at = ''.join(
            ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
            for k, v in elem.attrib.items())
        # Write the tag.
        text = ['<%s%s>' % (tag, at)]
        # Process tags that contain text.
        elem_text = getattr(elem, 'text', None)
        if elem_text:
            if tag == 'pre':
                # Don't unsmarten text in pre tags, but still escape it:
                # the parsed text is unescaped, so writing it raw would
                # produce broken markup when it contains '<' or '&'.
                text.append(prepare_string_for_xml(elem_text))
            else:
                text.append(prepare_string_for_xml(unsmarten_text(elem_text)))
        # Recurse down into tags within the tag we are in.
        for item in elem:
            text.extend(dump_text(item))
        # Close the tag opened above.
        text.append('</%s>' % tag)
        # Add the text that is outside of the tag.
        elem_tail = getattr(elem, 'tail', None)
        if elem_tail:
            text.append(prepare_string_for_xml(unsmarten_text(elem_tail)))
        return text
    content = lhtml.fromstring(html)
    return ''.join(dump_text(content))
 def unsmarten_text(txt):
    txt = re.sub(u'&#8211;|&ndash;|–', r'--', txt) # en-dash
    txt = re.sub(u'&#8212;|&mdash;|—', r'---', txt) # em-dash