Conversion: Add option to unsmarten punctuation under Look & Feel

This commit is contained in:
Kovid Goyal 2011-09-06 21:20:38 -06:00
commit c5c9738f63
13 changed files with 132 additions and 64 deletions

View File

@ -134,7 +134,8 @@ def add_pipeline_options(parser, plumber):
'font_size_mapping',
'line_height', 'minimum_line_height',
'linearize_tables',
'extra_css', 'smarten_punctuation',
'extra_css',
'smarten_punctuation', 'unsmarten_punctuation',
'margin_top', 'margin_left', 'margin_right',
'margin_bottom', 'change_justification',
'insert_blank_line', 'insert_blank_line_size',

View File

@ -415,6 +415,13 @@ OptionRecommendation(name='smarten_punctuation',
)
),
OptionRecommendation(name='unsmarten_punctuation',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Convert fancy quotes, dashes and ellipsis to their '
'plain equivalents.'
)
),
OptionRecommendation(name='read_metadata_from_opf',
recommended_value=None, level=OptionRecommendation.LOW,
short_switch='m',
@ -1017,6 +1024,10 @@ OptionRecommendation(name='sr3_replace',
self.output_plugin.file_type not in ('mobi', 'lrf'):
from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
LinearizeTables()(self.oeb, self.opts)
if self.opts.unsmarten_punctuation:
from calibre.ebooks.oeb.transforms.unsmarten import UnsmartenPunctuation
UnsmartenPunctuation()(self.oeb, self.opts)
flattener = CSSFlattener(fbase=fbase, fkey=fkey,
lineh=line_height,

View File

@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.oeb.base import OEB_DOCS, XPath, barename
from calibre.utils.unsmarten import unsmarten_text
class UnsmartenPunctuation(object):
    """Transform that runs ``unsmarten_text`` over the text of a book's documents.

    Calling an instance with an OEB book visits every manifest item whose
    media type is in ``OEB_DOCS`` and rewrites the ``text`` and ``tail`` of
    the elements found under each ``h:body``.
    """

    def __init__(self):
        # Selects every ``h:``-namespaced descendant element of the node the
        # expression is evaluated against.
        self.html_tags = XPath('descendant::h:*')

    def unsmarten(self, root):
        """Rewrite the text and tail of each element below ``root`` in place.

        Elements whose bare tag name is ``pre`` are left untouched (neither
        their ``text`` nor their ``tail`` is modified).
        """
        for elem in self.html_tags(root):
            # barename() operates on a tag *name*; handing it the element
            # itself can never compare equal to 'pre', so pass elem.tag.
            if barename(elem.tag) == 'pre':
                continue
            text = getattr(elem, 'text', None)
            if text:
                elem.text = unsmarten_text(text)
            tail = getattr(elem, 'tail', None)
            if tail:
                elem.tail = unsmarten_text(tail)

    def __call__(self, oeb, context):
        """Apply :meth:`unsmarten` to every body of every OEB document in ``oeb``.

        ``context`` is accepted for the transform calling convention but is
        not used here.
        """
        find_bodies = XPath('//h:body')
        for item in oeb.manifest.items:
            if item.media_type not in OEB_DOCS:
                continue
            for body in find_bodies(item.data):
                self.unsmarten(body)

View File

@ -7,9 +7,6 @@ __docformat__ = 'restructuredtext en'
import re
def unsmarten(txt):
from calibre.ebooks.txt.unsmarten import unsmarten as txt_unsmarten
txt = txt_unsmarten(txt)
txt = re.sub(u'&#162;|&cent;|¢', r'{c\}', txt) # cent
txt = re.sub(u'&#163;|&pound;|£', r'{L-}', txt) # pound
txt = re.sub(u'&#165;|&yen;|¥', r'{Y=}', txt) # yen

View File

@ -15,7 +15,6 @@ from functools import partial
from calibre.ebooks.htmlz.oeb2html import OEB2HTML
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.txt.unsmarten import unsmarten
class MarkdownMLizer(OEB2HTML):
@ -34,8 +33,6 @@ class MarkdownMLizer(OEB2HTML):
self.style_italic = False
txt = self.mlize_spine(oeb_book)
if self.opts.unsmarten_punctuation:
txt = unsmarten(txt)
# Do some tidying up
txt = self.tidy_up(txt)

View File

@ -56,10 +56,6 @@ class TXTOutput(OutputFormatPlugin):
'* plain: Produce plain text.\n'
'* markdown: Produce Markdown formatted text.\n'
'* textile: Produce Textile formatted text.')),
OptionRecommendation(name='unsmarten_punctuation',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Convert fancy quotes, dashes and ellipsis to their '
'plain equivalents.')),
OptionRecommendation(name='keep_links',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Do not remove links within the document. This is only ' \

View File

@ -83,7 +83,7 @@ class TextileMLizer(OEB2HTML):
for i in self.our_ids:
if i not in self.our_links:
text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text)
# Remove obvious non-needed escaping, add sub/sup-script ones
text = check_escaping(text, ['\*', '_', '\*'])
# escape the super/sub-scripts if needed
@ -189,7 +189,7 @@ class TextileMLizer(OEB2HTML):
emright = int(round(right / stylizer.profile.fbase))
if emright >= 1:
txt += ')' * emright
return txt
def check_id_tag(self, attribs):
@ -235,7 +235,7 @@ class TextileMLizer(OEB2HTML):
tags = []
tag = barename(elem.tag)
attribs = elem.attrib
# Ignore anything that is set to not be displayed.
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
or style['visibility'] == 'hidden':
@ -246,7 +246,7 @@ class TextileMLizer(OEB2HTML):
ems = int(round(float(style.marginTop) / style.fontSize) - 1)
if ems >= 1:
text.append(u'\n\n\xa0' * ems)
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
if tag == 'div':
tag = 'p'
@ -432,7 +432,7 @@ class TextileMLizer(OEB2HTML):
'span', 'table', 'tr', 'td'):
if not self.in_a_link:
text.append(self.check_styles(style))
# Process tags that contain text.
if hasattr(elem, 'text') and elem.text:
txt = elem.text

View File

@ -12,8 +12,6 @@ import re
from lxml import etree
from calibre.ebooks.txt.unsmarten import unsmarten
BLOCK_TAGS = [
'div',
@ -78,8 +76,6 @@ class TXTMLizer(object):
output += '\n\n\n\n\n\n'
output = u''.join(output)
output = u'\n'.join(l.rstrip() for l in output.splitlines())
if self.opts.unsmarten_punctuation:
output = unsmarten(output)
output = self.cleanup_text(output)
return output

View File

@ -1,18 +0,0 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
__docformat__ = 'restructuredtext en'
import re
def unsmarten(txt):
    """Return ``txt`` with typographic punctuation replaced by plain ASCII.

    Handles the numeric entity, the named entity and the literal character
    for dashes, ellipsis and quotes. Non-ASCII characters are written as
    ``\\u`` escapes so the patterns cannot be damaged by a re-encoding of
    this file; an alternative that loses its character becomes an empty
    alternative, which matches at every position in the text.
    """
    txt = re.sub(u'&#8211;|&ndash;|\u2013', r'-', txt)  # en-dash
    txt = re.sub(u'&#8212;|&mdash;|\u2014', r'--', txt)  # em-dash
    txt = re.sub(u'&#8230;|&hellip;|\u2026', r'...', txt)  # ellipsis

    # double quote: left/right double quotation marks and double prime
    txt = re.sub(u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|\u201c|\u201d|\u2033', r'"', txt)
    # apostrophe: a right single quotation mark directly after an opening
    # quote or whitespace. Must run before the single-quote rule below,
    # which would otherwise consume the character.
    # NOTE(review): the trailing U+2019 is reconstructed from the
    # "apostrophe" intent of this rule - confirm against the upstream file.
    txt = re.sub(u'(["\'\u2018\u201c]|\\s)\u2019', r"\1{'/}", txt)
    # single quote: left/right single quotation marks and prime
    txt = re.sub(u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|\u2018|\u2019|\u2032', r"'", txt)

    return txt

View File

@ -22,13 +22,14 @@ class LookAndFeelWidget(Widget, Ui_Form):
Widget.__init__(self, parent,
['change_justification', 'extra_css', 'base_font_size',
'font_size_mapping', 'line_height', 'minimum_line_height',
'linearize_tables', 'smarten_punctuation',
'smarten_punctuation', 'unsmarten_punctuation',
'disable_font_rescaling', 'insert_blank_line',
'remove_paragraph_spacing',
'remove_paragraph_spacing_indent_size',
'insert_blank_line_size',
'input_encoding',
'asciiize', 'keep_ligatures']
'asciiize', 'keep_ligatures',
'linearize_tables']
)
for val, text in [
('original', _('Original')),

View File

@ -7,7 +7,7 @@
<x>0</x>
<y>0</y>
<width>642</width>
<height>500</height>
<height>522</height>
</rect>
</property>
<property name="windowTitle">
@ -84,7 +84,7 @@
<string>...</string>
</property>
<property name="icon">
<iconset resource="../../../../resources/images.qrc">
<iconset>
<normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset>
</property>
<property name="iconSize">
@ -194,13 +194,6 @@
<item row="8" column="2" colspan="3">
<widget class="QComboBox" name="opt_change_justification"/>
</item>
<item row="9" column="0">
<widget class="QCheckBox" name="opt_linearize_tables">
<property name="text">
<string>&amp;Linearize tables</string>
</property>
</widget>
</item>
<item row="9" column="1" colspan="4">
<widget class="QCheckBox" name="opt_asciiize">
<property name="text">
@ -215,7 +208,7 @@
</property>
</widget>
</item>
<item row="12" column="0" colspan="5">
<item row="13" column="0" colspan="5">
<widget class="QGroupBox" name="groupBox">
<property name="title">
<string>Extra &amp;CSS</string>
@ -240,13 +233,6 @@
</property>
</widget>
</item>
<item row="10" column="0">
<widget class="QCheckBox" name="opt_smarten_punctuation">
<property name="text">
<string>Smarten &amp;punctuation</string>
</property>
</widget>
</item>
<item row="6" column="3">
<widget class="QLabel" name="label_4">
<property name="text">
@ -273,6 +259,27 @@
</property>
</widget>
</item>
<item row="9" column="0">
<widget class="QCheckBox" name="opt_smarten_punctuation">
<property name="text">
<string>Smarten &amp;punctuation</string>
</property>
</widget>
</item>
<item row="10" column="0">
<widget class="QCheckBox" name="opt_unsmarten_punctuation">
<property name="text">
<string>&amp;UnSmarten punctuation</string>
</property>
</widget>
</item>
<item row="10" column="3">
<widget class="QCheckBox" name="opt_linearize_tables">
<property name="text">
<string>&amp;Linearize tables</string>
</property>
</widget>
</item>
</layout>
</widget>
<customwidgets>

View File

@ -7,26 +7,32 @@ import re
from UserDict import UserDict
class MReplace(UserDict):
    """Dictionary of ``search -> replacement`` strings applied in a single pass.

    The keys are compiled into one alternation regex (longest keys first, so
    a longer key wins over a key that is its prefix). :meth:`mreplace`
    substitutes every match with the value stored for the matched key.

    :param data: optional initial mapping of search strings to replacements.
    :param case_sensitive: when False the regex is compiled with ``re.I`` and
        matched text is resolved to its key ignoring case.
    """

    def __init__(self, data=None, case_sensitive=True):
        UserDict.__init__(self, data)
        self.re = None
        self.regex = None
        self.case_sensitive = case_sensitive
        # Lower-cased key -> replacement, used for case-insensitive lookups.
        self._folded = {}
        self.compile_regex()

    def compile_regex(self):
        """(Re)build the combined regex from the current keys.

        Call this again after changing the mapping; the pattern is only
        recompiled when the set of keys produces a different expression.
        """
        self._folded = dict((k.lower(), v) for k, v in self.data.items())
        if len(self.data) > 0:
            keys = sorted(self.data.keys(), key=len)
            keys.reverse()
            tmp = "(%s)" % "|".join(map(re.escape, keys))
            if self.re != tmp:
                self.re = tmp
                if self.case_sensitive:
                    self.regex = re.compile(self.re)
                else:
                    self.regex = re.compile(self.re, re.I)

    def __call__(self, mo):
        """Return the replacement for match object ``mo`` (``re.sub`` callback)."""
        matched = mo.string[mo.start():mo.end()]
        if self.case_sensitive or matched in self.data:
            return self[matched]
        # With re.I the matched text may differ in case from the stored key,
        # so a verbatim lookup would raise KeyError.
        return self._folded[matched.lower()]

    def mreplace(self, text):
        """Return ``text`` with every occurrence of a key replaced by its value."""
        # Replace without regex compile
        if len(self.data) < 1 or self.re is None:
            return text
        return self.regex.sub(self, text)

View File

@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.utils.mreplace import MReplace
# Table of "smart" punctuation (numeric entity, named entity and the literal
# character) mapped to its plain-ASCII replacement.
#
# The literal characters are spelled as \u escapes (this module enables
# unicode_literals) so that a re-encoding of the file cannot turn them into
# empty-string keys: an empty key would collapse the entries into one and add
# an empty alternative to the combined regex, matching at every position.
_mreplace = MReplace({
        # en dash
        '&#8211;': '--',
        '&ndash;': '--',
        '\u2013': '--',
        # em dash
        '&#8212;': '---',
        '&mdash;': '---',
        '\u2014': '---',
        # horizontal ellipsis
        '&#8230;': '...',
        '&hellip;': '...',
        '\u2026': '...',
        # left/right double quotation marks and double prime
        '&#8220;': '"',
        '&#8221;': '"',
        '&#8243;': '"',
        '&ldquo;': '"',
        '&rdquo;': '"',
        '&Prime;': '"',
        '\u201c': '"',
        '\u201d': '"',
        '\u2033': '"',
        # left/right single quotation marks and prime
        '&#8216;': "'",
        '&#8217;': "'",
        '&#8242;': "'",
        '&lsquo;': "'",
        '&rsquo;': "'",
        '&prime;': "'",
        '\u2018': "'",
        '\u2019': "'",
        '\u2032': "'",
    }
)

# Public entry point: unsmarten_text(text) returns text with the replacements
# above applied.
unsmarten_text = _mreplace.mreplace