mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Conversion: Add option to unsmarten punctuation under Look & Feel
This commit is contained in:
commit
c5c9738f63
@ -134,7 +134,8 @@ def add_pipeline_options(parser, plumber):
|
||||
'font_size_mapping',
|
||||
'line_height', 'minimum_line_height',
|
||||
'linearize_tables',
|
||||
'extra_css', 'smarten_punctuation',
|
||||
'extra_css',
|
||||
'smarten_punctuation', 'unsmarten_punctuation',
|
||||
'margin_top', 'margin_left', 'margin_right',
|
||||
'margin_bottom', 'change_justification',
|
||||
'insert_blank_line', 'insert_blank_line_size',
|
||||
|
@ -415,6 +415,13 @@ OptionRecommendation(name='smarten_punctuation',
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='unsmarten_punctuation',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Convert fancy quotes, dashes and ellipsis to their '
|
||||
'plain equivalents.'
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='read_metadata_from_opf',
|
||||
recommended_value=None, level=OptionRecommendation.LOW,
|
||||
short_switch='m',
|
||||
@ -1017,6 +1024,10 @@ OptionRecommendation(name='sr3_replace',
|
||||
self.output_plugin.file_type not in ('mobi', 'lrf'):
|
||||
from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
|
||||
LinearizeTables()(self.oeb, self.opts)
|
||||
|
||||
if self.opts.unsmarten_punctuation:
|
||||
from calibre.ebooks.oeb.transforms.unsmarten import UnsmartenPunctuation
|
||||
UnsmartenPunctuation()(self.oeb, self.opts)
|
||||
|
||||
flattener = CSSFlattener(fbase=fbase, fkey=fkey,
|
||||
lineh=line_height,
|
||||
|
31
src/calibre/ebooks/oeb/transforms/unsmarten.py
Normal file
31
src/calibre/ebooks/oeb/transforms/unsmarten.py
Normal file
@ -0,0 +1,31 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS, XPath, barename
|
||||
from calibre.utils.unsmarten import unsmarten_text
|
||||
|
||||
class UnsmartenPunctuation(object):
    '''
    Conversion transform that converts fancy quotes, dashes and ellipsis
    in the HTML documents of a book to their plain equivalents.

    The character mapping itself is delegated to ``unsmarten_text``; this
    class only walks the element trees and rewrites ``text``/``tail``
    strings in place.
    '''

    def __init__(self):
        # Matches every descendant element in the namespace bound to the
        # ``h`` prefix (presumably XHTML -- defined by the XPath helper).
        self.html_tags = XPath('descendant::h:*')

    def unsmarten(self, root):
        '''
        Rewrite the text and tail of every element found below `root`,
        skipping elements whose bare tag name is ``pre``.

        :param root: element whose descendants are processed in place.
        '''
        for x in self.html_tags(root):
            # barename() works on a tag *name* (other callers pass
            # ``elem.tag``); handing it the element itself meant the
            # comparison with 'pre' could not succeed.
            if barename(x.tag) == 'pre':
                continue
            # NOTE(review): indentation was lost in the extracted source;
            # both rewrites are assumed to sit under the <pre> guard.
            text = getattr(x, 'text', None)
            if text:
                x.text = unsmarten_text(text)
            tail = getattr(x, 'tail', None)
            if tail:
                x.tail = unsmarten_text(tail)

    def __call__(self, oeb, context):
        '''
        Apply the transform to each body element of every manifest item
        whose media type is in OEB_DOCS.

        :param oeb: book object exposing ``manifest.items``.
        :param context: accepted for the transform call signature; not
                        used here.
        '''
        bx = XPath('//h:body')
        for x in oeb.manifest.items:
            if x.media_type in OEB_DOCS:
                for body in bx(x.data):
                    self.unsmarten(body)
|
||||
|
@ -7,9 +7,6 @@ __docformat__ = 'restructuredtext en'
|
||||
import re
|
||||
|
||||
def unsmarten(txt):
|
||||
from calibre.ebooks.txt.unsmarten import unsmarten as txt_unsmarten
|
||||
txt = txt_unsmarten(txt)
|
||||
|
||||
txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent
|
||||
txt = re.sub(u'£|£|£', r'{L-}', txt) # pound
|
||||
txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen
|
||||
|
@ -15,7 +15,6 @@ from functools import partial
|
||||
from calibre.ebooks.htmlz.oeb2html import OEB2HTML
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
from calibre.ebooks.txt.unsmarten import unsmarten
|
||||
|
||||
class MarkdownMLizer(OEB2HTML):
|
||||
|
||||
@ -34,8 +33,6 @@ class MarkdownMLizer(OEB2HTML):
|
||||
self.style_italic = False
|
||||
|
||||
txt = self.mlize_spine(oeb_book)
|
||||
if self.opts.unsmarten_punctuation:
|
||||
txt = unsmarten(txt)
|
||||
|
||||
# Do some tidying up
|
||||
txt = self.tidy_up(txt)
|
||||
|
@ -56,10 +56,6 @@ class TXTOutput(OutputFormatPlugin):
|
||||
'* plain: Produce plain text.\n'
|
||||
'* markdown: Produce Markdown formatted text.\n'
|
||||
'* textile: Produce Textile formatted text.')),
|
||||
OptionRecommendation(name='unsmarten_punctuation',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Convert fancy quotes, dashes and ellipsis to their '
|
||||
'plain equivalents.')),
|
||||
OptionRecommendation(name='keep_links',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Do not remove links within the document. This is only ' \
|
||||
|
@ -83,7 +83,7 @@ class TextileMLizer(OEB2HTML):
|
||||
for i in self.our_ids:
|
||||
if i not in self.our_links:
|
||||
text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text)
|
||||
|
||||
|
||||
# Remove obvious non-needed escaping, add sub/sup-script ones
|
||||
text = check_escaping(text, ['\*', '_', '\*'])
|
||||
# escape the super/sub-scripts if needed
|
||||
@ -189,7 +189,7 @@ class TextileMLizer(OEB2HTML):
|
||||
emright = int(round(right / stylizer.profile.fbase))
|
||||
if emright >= 1:
|
||||
txt += ')' * emright
|
||||
|
||||
|
||||
return txt
|
||||
|
||||
def check_id_tag(self, attribs):
|
||||
@ -235,7 +235,7 @@ class TextileMLizer(OEB2HTML):
|
||||
tags = []
|
||||
tag = barename(elem.tag)
|
||||
attribs = elem.attrib
|
||||
|
||||
|
||||
# Ignore anything that is set to not be displayed.
|
||||
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
|
||||
or style['visibility'] == 'hidden':
|
||||
@ -246,7 +246,7 @@ class TextileMLizer(OEB2HTML):
|
||||
ems = int(round(float(style.marginTop) / style.fontSize) - 1)
|
||||
if ems >= 1:
|
||||
text.append(u'\n\n\xa0' * ems)
|
||||
|
||||
|
||||
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
|
||||
if tag == 'div':
|
||||
tag = 'p'
|
||||
@ -432,7 +432,7 @@ class TextileMLizer(OEB2HTML):
|
||||
'span', 'table', 'tr', 'td'):
|
||||
if not self.in_a_link:
|
||||
text.append(self.check_styles(style))
|
||||
|
||||
|
||||
# Process tags that contain text.
|
||||
if hasattr(elem, 'text') and elem.text:
|
||||
txt = elem.text
|
||||
|
@ -12,8 +12,6 @@ import re
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.txt.unsmarten import unsmarten
|
||||
|
||||
|
||||
BLOCK_TAGS = [
|
||||
'div',
|
||||
@ -78,8 +76,6 @@ class TXTMLizer(object):
|
||||
output += '\n\n\n\n\n\n'
|
||||
output = u''.join(output)
|
||||
output = u'\n'.join(l.rstrip() for l in output.splitlines())
|
||||
if self.opts.unsmarten_punctuation:
|
||||
output = unsmarten(output)
|
||||
output = self.cleanup_text(output)
|
||||
|
||||
return output
|
||||
|
@ -1,18 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
def unsmarten(txt):
    '''
    Replace typographic dashes, ellipses and quotes in `txt` with plain
    ASCII equivalents and return the result.

    Each character is matched in three spellings: numeric HTML entity,
    named HTML entity and the literal character.

    NOTE(review): the entity spellings were reconstructed -- the extracted
    source showed every alternation as the same literal character repeated,
    which looks like entities decoded during extraction. Confirm against
    the repository.

    :param txt: unicode text to convert.
    :return: the converted text.
    '''
    txt = re.sub(u'&#8211;|&ndash;|–', r'-', txt)  # en-dash
    txt = re.sub(u'&#8212;|&mdash;|—', r'--', txt)  # em-dash
    txt = re.sub(u'&#8230;|&hellip;|…', r'...', txt)  # ellipsis

    txt = re.sub(u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|“|”|″', r'"', txt)  # double quote
    # A right single quote that follows a quote character or whitespace is
    # emitted as the {'/} markup instead of a plain apostrophe. The
    # backslash is doubled so the non-raw literal still yields regex \s.
    txt = re.sub(u'(["\'‘“]|\\s)’', r"\1{'/}", txt)  # apostrophe
    txt = re.sub(u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|‘|’|′', r"'", txt)  # single quote

    return txt
|
@ -22,13 +22,14 @@ class LookAndFeelWidget(Widget, Ui_Form):
|
||||
Widget.__init__(self, parent,
|
||||
['change_justification', 'extra_css', 'base_font_size',
|
||||
'font_size_mapping', 'line_height', 'minimum_line_height',
|
||||
'linearize_tables', 'smarten_punctuation',
|
||||
'smarten_punctuation', 'unsmarten_punctuation',
|
||||
'disable_font_rescaling', 'insert_blank_line',
|
||||
'remove_paragraph_spacing',
|
||||
'remove_paragraph_spacing_indent_size',
|
||||
'insert_blank_line_size',
|
||||
'input_encoding',
|
||||
'asciiize', 'keep_ligatures']
|
||||
'asciiize', 'keep_ligatures',
|
||||
'linearize_tables']
|
||||
)
|
||||
for val, text in [
|
||||
('original', _('Original')),
|
||||
|
@ -7,7 +7,7 @@
|
||||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>642</width>
|
||||
<height>500</height>
|
||||
<height>522</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="windowTitle">
|
||||
@ -84,7 +84,7 @@
|
||||
<string>...</string>
|
||||
</property>
|
||||
<property name="icon">
|
||||
<iconset resource="../../../../resources/images.qrc">
|
||||
<iconset>
|
||||
<normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset>
|
||||
</property>
|
||||
<property name="iconSize">
|
||||
@ -194,13 +194,6 @@
|
||||
<item row="8" column="2" colspan="3">
|
||||
<widget class="QComboBox" name="opt_change_justification"/>
|
||||
</item>
|
||||
<item row="9" column="0">
|
||||
<widget class="QCheckBox" name="opt_linearize_tables">
|
||||
<property name="text">
|
||||
<string>&amp;Linearize tables</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="9" column="1" colspan="4">
|
||||
<widget class="QCheckBox" name="opt_asciiize">
|
||||
<property name="text">
|
||||
@ -215,7 +208,7 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="12" column="0" colspan="5">
|
||||
<item row="13" column="0" colspan="5">
|
||||
<widget class="QGroupBox" name="groupBox">
|
||||
<property name="title">
|
||||
<string>Extra &amp;CSS</string>
|
||||
@ -240,13 +233,6 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="10" column="0">
|
||||
<widget class="QCheckBox" name="opt_smarten_punctuation">
|
||||
<property name="text">
|
||||
<string>Smarten &amp;punctuation</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="6" column="3">
|
||||
<widget class="QLabel" name="label_4">
|
||||
<property name="text">
|
||||
@ -273,6 +259,27 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="9" column="0">
|
||||
<widget class="QCheckBox" name="opt_smarten_punctuation">
|
||||
<property name="text">
|
||||
<string>Smarten &amp;punctuation</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="10" column="0">
|
||||
<widget class="QCheckBox" name="opt_unsmarten_punctuation">
|
||||
<property name="text">
|
||||
<string>&amp;UnSmarten punctuation</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="10" column="3">
|
||||
<widget class="QCheckBox" name="opt_linearize_tables">
|
||||
<property name="text">
|
||||
<string>&amp;Linearize tables</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
<customwidgets>
|
||||
|
@ -7,26 +7,32 @@ import re
|
||||
from UserDict import UserDict
|
||||
|
||||
class MReplace(UserDict):
    '''
    Dictionary of literal search strings mapped to their replacements,
    applied to a text in a single regular-expression pass.

    Longer keys are tried before shorter ones, so a key that is a prefix
    of another key does not shadow it.
    '''

    def __init__(self, data=None, case_sensitive=True):
        '''
        :param data: optional mapping of search string -> replacement.
        :param case_sensitive: when False the search strings are matched
                               ignoring case.
        '''
        UserDict.__init__(self, data)
        self.re = None      # pattern source currently compiled, or None
        self.regex = None   # compiled pattern, or None when there is no data
        self.case_sensitive = case_sensitive
        self.compile_regex()

    def compile_regex(self):
        '''
        (Re)build the alternation pattern from the current keys.

        Does nothing when the mapping is empty or the pattern source is
        unchanged since the last compile.
        '''
        if len(self.data) > 0:
            # Longest keys first so the alternation prefers the longest match.
            keys = sorted(self.data.keys(), key=len)
            keys.reverse()
            tmp = "(%s)" % "|".join(map(re.escape, keys))
            if self.re != tmp:
                self.re = tmp
                if self.case_sensitive:
                    self.regex = re.compile(self.re)
                else:
                    self.regex = re.compile(self.re, re.I)

    def __call__(self, mo):
        '''
        Replacement callback for ``regex.sub``: return the replacement
        stored for the text matched by `mo`.

        :raises KeyError: if no stored key corresponds to the matched text.
        '''
        matched = mo.string[mo.start():mo.end()]
        if self.case_sensitive or matched in self.data:
            return self[matched]
        # Case-insensitive matching can yield text whose case differs from
        # the stored key; look the key up ignoring case instead of failing.
        folded = matched.lower()
        for key in self.data:
            if key.lower() == folded:
                return self.data[key]
        raise KeyError(matched)

    def mreplace(self, text):
        '''
        Return `text` with every occurrence of a key replaced by its value.

        The pattern is not recompiled here; call ``compile_regex`` after
        changing the mapping.
        '''
        # Replace without regex compile
        if len(self.data) < 1 or self.re is None:
            return text
        return self.regex.sub(self, text)
|
||||
|
||||
|
43
src/calibre/utils/unsmarten.py
Normal file
43
src/calibre/utils/unsmarten.py
Normal file
@ -0,0 +1,43 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.utils.mreplace import MReplace
|
||||
|
||||
# Mapping of typographic punctuation to plain equivalents. Each character
# is listed in three spellings: numeric HTML entity, named HTML entity and
# the literal character.
# NOTE(review): the entity spellings were reconstructed -- the extracted
# source showed every key as the literal character three times (duplicate
# dict keys), which looks like entities decoded during extraction. Confirm
# against the repository.
_mreplace = MReplace({
    # en dash
    '&#8211;': '--',
    '&ndash;': '--',
    '–': '--',
    # em dash
    '&#8212;': '---',
    '&mdash;': '---',
    '—': '---',
    # ellipsis
    '&#8230;': '...',
    '&hellip;': '...',
    '…': '...',
    # double quotes and double prime
    '&#8220;': '"',
    '&#8221;': '"',
    '&#8243;': '"',
    '&ldquo;': '"',
    '&rdquo;': '"',
    '&Prime;': '"',
    '“': '"',
    '”': '"',
    '″': '"',
    # single quotes and prime
    '&#8216;': "'",
    '&#8217;': "'",
    '&#8242;': "'",
    '&lsquo;': "'",
    '&rsquo;': "'",
    '&prime;': "'",
    '‘': "'",
    '’': "'",
    '′': "'",
})

# Public entry point: unsmarten_text(text) returns the converted text.
unsmarten_text = _mreplace.mreplace
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user