class UnsmartenPunctuation(object):
    '''
    Transform that runs ``unsmarten_text`` over the text content of the
    documents in an OEB book, modifying the parsed trees in place.
    '''

    def unsmarten(self, root):
        '''
        Apply ``unsmarten_text`` to the text and tail of every element
        matched by ``//h:*`` under ``root``.

        The text directly inside a ``pre`` element is left untouched so
        preformatted content is not altered.

        :param root: Parsed document tree to modify in place.
        '''
        for x in XPath('//h:*')(root):
            # Compare the element's tag name, not the element object: an
            # element never compares equal to 'pre', so passing it would
            # make this check always succeed and rewrite <pre> text too.
            if barename(x.tag) != 'pre':
                text = getattr(x, 'text', None)
                if text:
                    x.text = unsmarten_text(text)
            # The tail is the text that follows the element's closing tag,
            # so it lies outside a <pre> and is always processed.
            tail = getattr(x, 'tail', None)
            if tail:
                x.tail = unsmarten_text(tail)

    def __call__(self, oeb, context):
        '''
        Run the transform over every manifest item of ``oeb`` whose media
        type is listed in ``OEB_DOCS``.

        :param oeb: Book object exposing ``manifest.items``.
        :param context: Unused by this transform.
        '''
        for x in oeb.manifest.items:
            if x.media_type in OEB_DOCS:
                self.unsmarten(x.data)