mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Change the unsmarten option to be an OEB transform, make it handle more cases, and use a simpler implementation.
This commit is contained in:
parent
ec448064aa
commit
1ce4a97f63
@ -1024,6 +1024,10 @@ OptionRecommendation(name='sr3_replace',
|
||||
self.output_plugin.file_type not in ('mobi', 'lrf'):
|
||||
from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
|
||||
LinearizeTables()(self.oeb, self.opts)
|
||||
|
||||
if self.opts.unsmarten_punctuation:
|
||||
from calibre.ebooks.oeb.transforms.unsmarten import UnsmartenPunctuation
|
||||
UnsmartenPunctuation()(self.oeb, self.opts)
|
||||
|
||||
flattener = CSSFlattener(fbase=fbase, fkey=fkey,
|
||||
lineh=line_height,
|
||||
|
@ -605,9 +605,6 @@ class HTMLPreProcessor(object):
|
||||
|
||||
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
||||
html = self.smarten_punctuation(html)
|
||||
|
||||
if getattr(self.extra_opts, 'unsmarten_punctuation', False):
|
||||
html = self.unsmarten_punctuation(html)
|
||||
|
||||
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
|
||||
if unsupported_unicode_chars:
|
||||
|
25
src/calibre/ebooks/oeb/transforms/unsmarten.py
Normal file
25
src/calibre/ebooks/oeb/transforms/unsmarten.py
Normal file
@ -0,0 +1,25 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS, XPath, barename
|
||||
from calibre.utils.unsmarten import unsmarten_text
|
||||
|
||||
class UnsmartenPunctuation(object):
    """OEB transform that runs ``unsmarten_text`` over the text of XHTML documents.

    Call the instance with an OEB book; every manifest item whose media type
    is in ``OEB_DOCS`` has its parsed tree rewritten in place.
    """

    def unsmarten(self, root):
        """Unsmarten the text and tail of every XHTML element under *root*.

        The tree is modified in place and nothing is returned.  The direct
        text of a ``<pre>`` element is left untouched.
        """
        for x in XPath('//h:*')(root):
            # Compare the element's tag name, not the element object itself.
            # NOTE(review): barename() is presumed to take a (possibly
            # namespace-qualified) tag string -- confirm against oeb.base.
            if barename(x.tag) != 'pre' and getattr(x, 'text', None):
                x.text = unsmarten_text(x.text)
            # The tail is the text that follows the element's closing tag, so
            # the tail of a <pre> is ordinary text and is converted as well.
            # NOTE(review): elements nested inside a <pre> are still visited
            # by the XPath above and therefore still converted.
            if getattr(x, 'tail', None):
                x.tail = unsmarten_text(x.tail)

    def __call__(self, oeb, context):
        """Apply the transform to every document item in ``oeb.manifest``.

        *context* is accepted for the transform calling convention and is
        not used here.
        """
        for x in oeb.manifest.items:
            if x.media_type in OEB_DOCS:
                self.unsmarten(x.data)
|
@ -8,50 +8,6 @@ __docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
from lxml import html as lhtml
|
||||
|
||||
from calibre import prepare_string_for_xml
|
||||
from calibre.ebooks.oeb.base import barename
|
||||
|
||||
def unsmarten_html(html):
    """Parse *html*, unsmarten its text, and return the re-serialized markup.

    The input is parsed with ``lxml.html.fromstring`` and written back out by
    hand: each element becomes an opening tag with its attributes, its text,
    its children, a closing tag and finally its tail.  All text is passed
    through ``unsmarten_text`` except the direct text of ``<pre>`` elements.
    Text and attribute values are escaped with ``prepare_string_for_xml``.

    Returns the resulting markup as a single string.
    """

    def dump_text(elem):
        """Return the list of string fragments serializing *elem* and its tail."""
        tag = barename(elem.tag)
        # Turn the attributes into a string we can write with the tag.
        attrs = ''.join(
            ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
            for k, v in elem.attrib.items())
        parts = ['<%s%s>' % (tag, attrs)]

        if getattr(elem, 'text', None):
            # Punctuation inside <pre> is left alone, but the text is still
            # escaped: the parser has already decoded entities, so writing it
            # back raw would emit bare '<' and '&' characters.
            content = elem.text if tag == 'pre' else unsmarten_text(elem.text)
            parts.append(prepare_string_for_xml(content))

        # Recurse down into the elements nested within this one.
        for child in elem:
            parts.extend(dump_text(child))

        parts.append('</%s>' % tag)

        # The tail is the text that follows this element's closing tag.
        if getattr(elem, 'tail', None):
            parts.append(prepare_string_for_xml(unsmarten_text(elem.tail)))
        return parts

    return ''.join(dump_text(lhtml.fromstring(html)))
|
||||
|
||||
|
||||
def unsmarten_text(txt):
|
||||
txt = re.sub(u'–|–|–', r'--', txt) # en-dash
|
||||
txt = re.sub(u'—|—|—', r'---', txt) # em-dash
|
||||
|
Loading…
x
Reference in New Issue
Block a user