Change unsmarten option to be an oeb transform and make it handle more cases and use a simpler implementation.

This commit is contained in:
John Schember 2011-09-06 19:07:10 -04:00
parent ec448064aa
commit 1ce4a97f63
4 changed files with 29 additions and 47 deletions

View File

@ -1024,6 +1024,10 @@ OptionRecommendation(name='sr3_replace',
self.output_plugin.file_type not in ('mobi', 'lrf'):
from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
LinearizeTables()(self.oeb, self.opts)
if self.opts.unsmarten_punctuation:
from calibre.ebooks.oeb.transforms.unsmarten import UnsmartenPunctuation
UnsmartenPunctuation()(self.oeb, self.opts)
flattener = CSSFlattener(fbase=fbase, fkey=fkey,
lineh=line_height,

View File

@ -605,9 +605,6 @@ class HTMLPreProcessor(object):
if getattr(self.extra_opts, 'smarten_punctuation', False):
html = self.smarten_punctuation(html)
if getattr(self.extra_opts, 'unsmarten_punctuation', False):
html = self.unsmarten_punctuation(html)
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
if unsupported_unicode_chars:

View File

@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.oeb.base import OEB_DOCS, XPath, barename
from calibre.utils.unsmarten import unsmarten_text
class UnsmartenPunctuation(object):
    """Transform that runs ``unsmarten_text`` over the text of OEB documents.

    Calling an instance with ``(oeb, context)`` walks every manifest item
    whose media type is in ``OEB_DOCS`` and rewrites its element text in
    place.
    """

    def unsmarten(self, root):
        """Rewrite the text of the elements under ``root`` in place.

        Every element matched by the ``//h:*`` XPath has its ``text`` and
        ``tail`` passed through ``unsmarten_text``. The text directly inside
        a ``pre`` element is left untouched.

        :param root: Parsed document tree to modify.
        """
        for x in XPath('//h:*')(root):
            # Compare the tag name, not the element object itself; an element
            # never compares equal to 'pre', so pre text was being rewritten.
            if barename(x.tag) != 'pre' and getattr(x, 'text', None):
                x.text = unsmarten_text(x.text)
            # The tail is the text that follows the element's closing tag, so
            # it is outside a pre element and is always processed.
            if getattr(x, 'tail', None):
                x.tail = unsmarten_text(x.tail)

    def __call__(self, oeb, context):
        """Apply the transform to every document item in ``oeb.manifest``.

        :param oeb: Book object whose ``manifest.items`` are scanned.
        :param context: Accepted for the transform call signature; unused.
        """
        for x in oeb.manifest.items:
            if x.media_type in OEB_DOCS:
                self.unsmarten(x.data)

View File

@ -8,50 +8,6 @@ __docformat__ = 'restructuredtext en'
import re
from lxml import html as lhtml
from calibre import prepare_string_for_xml
from calibre.ebooks.oeb.base import barename
def unsmarten_html(html):
    """Return ``html`` re-serialized with its text run through ``unsmarten_text``.

    The markup is parsed with ``lxml.html`` and written back out element by
    element. Element text and tail strings are passed through
    ``unsmarten_text``; the text directly inside a ``pre`` tag is kept as
    is. All text and attribute values are escaped with
    ``prepare_string_for_xml`` before being written.

    :param html: Markup to process.
    :return: The rebuilt markup as a single string.
    """

    def dump_text(elem):
        """Serialize ``elem``, its children and its tail to a list of strings."""
        tag = barename(elem.tag)

        # Turn the attributes into a string we can write with the tag.
        attrs = ''.join(
            ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
            for k, v in elem.attrib.items())

        # Write the tag.
        text = ['<%s%s>' % (tag, attrs)]

        # Process tags that contain text.
        if getattr(elem, 'text', None):
            # Don't modify text in pre tags, but still escape it: the parser
            # hands back decoded text, so a literal '<' or '&' written out
            # raw would corrupt the generated markup.
            content = elem.text if tag == 'pre' else unsmarten_text(elem.text)
            text.append(prepare_string_for_xml(content))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text.extend(dump_text(item))

        # Close the tag.
        text.append('</%s>' % tag)

        # Add the text that is outside of the tag.
        if getattr(elem, 'tail', None):
            text.append(prepare_string_for_xml(unsmarten_text(elem.tail)))
        return text

    return ''.join(dump_text(lhtml.fromstring(html)))
def unsmarten_text(txt):
txt = re.sub(u'&#8211;|&ndash;|', r'--', txt) # en-dash
txt = re.sub(u'&#8212;|&mdash;|—', r'---', txt) # em-dash