mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Conversion: Add option to unsmarten punctuation under Look & Feel
This commit is contained in:
commit
c5c9738f63
@ -134,7 +134,8 @@ def add_pipeline_options(parser, plumber):
|
|||||||
'font_size_mapping',
|
'font_size_mapping',
|
||||||
'line_height', 'minimum_line_height',
|
'line_height', 'minimum_line_height',
|
||||||
'linearize_tables',
|
'linearize_tables',
|
||||||
'extra_css', 'smarten_punctuation',
|
'extra_css',
|
||||||
|
'smarten_punctuation', 'unsmarten_punctuation',
|
||||||
'margin_top', 'margin_left', 'margin_right',
|
'margin_top', 'margin_left', 'margin_right',
|
||||||
'margin_bottom', 'change_justification',
|
'margin_bottom', 'change_justification',
|
||||||
'insert_blank_line', 'insert_blank_line_size',
|
'insert_blank_line', 'insert_blank_line_size',
|
||||||
|
@ -415,6 +415,13 @@ OptionRecommendation(name='smarten_punctuation',
|
|||||||
)
|
)
|
||||||
),
|
),
|
||||||
|
|
||||||
|
OptionRecommendation(name='unsmarten_punctuation',
|
||||||
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Convert fancy quotes, dashes and ellipsis to their '
|
||||||
|
'plain equivalents.'
|
||||||
|
)
|
||||||
|
),
|
||||||
|
|
||||||
OptionRecommendation(name='read_metadata_from_opf',
|
OptionRecommendation(name='read_metadata_from_opf',
|
||||||
recommended_value=None, level=OptionRecommendation.LOW,
|
recommended_value=None, level=OptionRecommendation.LOW,
|
||||||
short_switch='m',
|
short_switch='m',
|
||||||
@ -1017,6 +1024,10 @@ OptionRecommendation(name='sr3_replace',
|
|||||||
self.output_plugin.file_type not in ('mobi', 'lrf'):
|
self.output_plugin.file_type not in ('mobi', 'lrf'):
|
||||||
from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
|
from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
|
||||||
LinearizeTables()(self.oeb, self.opts)
|
LinearizeTables()(self.oeb, self.opts)
|
||||||
|
|
||||||
|
if self.opts.unsmarten_punctuation:
|
||||||
|
from calibre.ebooks.oeb.transforms.unsmarten import UnsmartenPunctuation
|
||||||
|
UnsmartenPunctuation()(self.oeb, self.opts)
|
||||||
|
|
||||||
flattener = CSSFlattener(fbase=fbase, fkey=fkey,
|
flattener = CSSFlattener(fbase=fbase, fkey=fkey,
|
||||||
lineh=line_height,
|
lineh=line_height,
|
||||||
|
31
src/calibre/ebooks/oeb/transforms/unsmarten.py
Normal file
31
src/calibre/ebooks/oeb/transforms/unsmarten.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
||||||
|
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
from calibre.ebooks.oeb.base import OEB_DOCS, XPath, barename
|
||||||
|
from calibre.utils.unsmarten import unsmarten_text
|
||||||
|
|
||||||
|
class UnsmartenPunctuation(object):
    """OEB transform that replaces "smart" punctuation with plain equivalents.

    Calling an instance with an OEB book walks every document in the book's
    manifest and passes the text of the elements under each body through
    ``unsmarten_text``, modifying the parsed trees in place.
    """

    def __init__(self):
        # Selects all descendant elements matched by the ``h:`` prefix
        # (presumably the XHTML namespace -- confirm against the XPath helper
        # in calibre.ebooks.oeb.base).
        self.html_tags = XPath('descendant::h:*')

    def unsmarten(self, root):
        """Unsmarten the text and tail of every element below ``root``.

        The text directly inside a ``pre`` element is left untouched.

        NOTE(review): elements nested inside a ``pre`` are still processed,
        because only the element's own tag name is checked.
        """
        for x in self.html_tags(root):
            # barename() operates on a tag name (compare barename(elem.tag)
            # elsewhere in the code base), so hand it x.tag rather than the
            # element itself; otherwise the comparison with 'pre' is made
            # against the wrong value.
            if barename(x.tag) != 'pre' and getattr(x, 'text', None):
                x.text = unsmarten_text(x.text)
            # The tail is the text that follows the element's closing tag,
            # so it lies outside the element and is converted even for <pre>.
            if getattr(x, 'tail', None):
                x.tail = unsmarten_text(x.tail)

    def __call__(self, oeb, context):
        """Run the transform over every OEB document in ``oeb``.

        ``context`` is accepted for interface compatibility with the caller
        (``UnsmartenPunctuation()(self.oeb, self.opts)``) and is not used.
        """
        bx = XPath('//h:body')
        for x in oeb.manifest.items:
            if x.media_type in OEB_DOCS:
                for body in bx(x.data):
                    self.unsmarten(body)
|
||||||
|
|
@ -7,9 +7,6 @@ __docformat__ = 'restructuredtext en'
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
def unsmarten(txt):
|
def unsmarten(txt):
|
||||||
from calibre.ebooks.txt.unsmarten import unsmarten as txt_unsmarten
|
|
||||||
txt = txt_unsmarten(txt)
|
|
||||||
|
|
||||||
txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent
|
txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent
|
||||||
txt = re.sub(u'£|£|£', r'{L-}', txt) # pound
|
txt = re.sub(u'£|£|£', r'{L-}', txt) # pound
|
||||||
txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen
|
txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen
|
||||||
|
@ -15,7 +15,6 @@ from functools import partial
|
|||||||
from calibre.ebooks.htmlz.oeb2html import OEB2HTML
|
from calibre.ebooks.htmlz.oeb2html import OEB2HTML
|
||||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
|
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
|
||||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||||
from calibre.ebooks.txt.unsmarten import unsmarten
|
|
||||||
|
|
||||||
class MarkdownMLizer(OEB2HTML):
|
class MarkdownMLizer(OEB2HTML):
|
||||||
|
|
||||||
@ -34,8 +33,6 @@ class MarkdownMLizer(OEB2HTML):
|
|||||||
self.style_italic = False
|
self.style_italic = False
|
||||||
|
|
||||||
txt = self.mlize_spine(oeb_book)
|
txt = self.mlize_spine(oeb_book)
|
||||||
if self.opts.unsmarten_punctuation:
|
|
||||||
txt = unsmarten(txt)
|
|
||||||
|
|
||||||
# Do some tidying up
|
# Do some tidying up
|
||||||
txt = self.tidy_up(txt)
|
txt = self.tidy_up(txt)
|
||||||
|
@ -56,10 +56,6 @@ class TXTOutput(OutputFormatPlugin):
|
|||||||
'* plain: Produce plain text.\n'
|
'* plain: Produce plain text.\n'
|
||||||
'* markdown: Produce Markdown formatted text.\n'
|
'* markdown: Produce Markdown formatted text.\n'
|
||||||
'* textile: Produce Textile formatted text.')),
|
'* textile: Produce Textile formatted text.')),
|
||||||
OptionRecommendation(name='unsmarten_punctuation',
|
|
||||||
recommended_value=False, level=OptionRecommendation.LOW,
|
|
||||||
help=_('Convert fancy quotes, dashes and ellipsis to their '
|
|
||||||
'plain equivalents.')),
|
|
||||||
OptionRecommendation(name='keep_links',
|
OptionRecommendation(name='keep_links',
|
||||||
recommended_value=False, level=OptionRecommendation.LOW,
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
help=_('Do not remove links within the document. This is only ' \
|
help=_('Do not remove links within the document. This is only ' \
|
||||||
|
@ -83,7 +83,7 @@ class TextileMLizer(OEB2HTML):
|
|||||||
for i in self.our_ids:
|
for i in self.our_ids:
|
||||||
if i not in self.our_links:
|
if i not in self.our_links:
|
||||||
text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text)
|
text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text)
|
||||||
|
|
||||||
# Remove obvious non-needed escaping, add sub/sup-script ones
|
# Remove obvious non-needed escaping, add sub/sup-script ones
|
||||||
text = check_escaping(text, ['\*', '_', '\*'])
|
text = check_escaping(text, ['\*', '_', '\*'])
|
||||||
# escape the super/sub-scripts if needed
|
# escape the super/sub-scripts if needed
|
||||||
@ -189,7 +189,7 @@ class TextileMLizer(OEB2HTML):
|
|||||||
emright = int(round(right / stylizer.profile.fbase))
|
emright = int(round(right / stylizer.profile.fbase))
|
||||||
if emright >= 1:
|
if emright >= 1:
|
||||||
txt += ')' * emright
|
txt += ')' * emright
|
||||||
|
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
def check_id_tag(self, attribs):
|
def check_id_tag(self, attribs):
|
||||||
@ -235,7 +235,7 @@ class TextileMLizer(OEB2HTML):
|
|||||||
tags = []
|
tags = []
|
||||||
tag = barename(elem.tag)
|
tag = barename(elem.tag)
|
||||||
attribs = elem.attrib
|
attribs = elem.attrib
|
||||||
|
|
||||||
# Ignore anything that is set to not be displayed.
|
# Ignore anything that is set to not be displayed.
|
||||||
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
|
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
|
||||||
or style['visibility'] == 'hidden':
|
or style['visibility'] == 'hidden':
|
||||||
@ -246,7 +246,7 @@ class TextileMLizer(OEB2HTML):
|
|||||||
ems = int(round(float(style.marginTop) / style.fontSize) - 1)
|
ems = int(round(float(style.marginTop) / style.fontSize) - 1)
|
||||||
if ems >= 1:
|
if ems >= 1:
|
||||||
text.append(u'\n\n\xa0' * ems)
|
text.append(u'\n\n\xa0' * ems)
|
||||||
|
|
||||||
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
|
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
|
||||||
if tag == 'div':
|
if tag == 'div':
|
||||||
tag = 'p'
|
tag = 'p'
|
||||||
@ -432,7 +432,7 @@ class TextileMLizer(OEB2HTML):
|
|||||||
'span', 'table', 'tr', 'td'):
|
'span', 'table', 'tr', 'td'):
|
||||||
if not self.in_a_link:
|
if not self.in_a_link:
|
||||||
text.append(self.check_styles(style))
|
text.append(self.check_styles(style))
|
||||||
|
|
||||||
# Process tags that contain text.
|
# Process tags that contain text.
|
||||||
if hasattr(elem, 'text') and elem.text:
|
if hasattr(elem, 'text') and elem.text:
|
||||||
txt = elem.text
|
txt = elem.text
|
||||||
|
@ -12,8 +12,6 @@ import re
|
|||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre.ebooks.txt.unsmarten import unsmarten
|
|
||||||
|
|
||||||
|
|
||||||
BLOCK_TAGS = [
|
BLOCK_TAGS = [
|
||||||
'div',
|
'div',
|
||||||
@ -78,8 +76,6 @@ class TXTMLizer(object):
|
|||||||
output += '\n\n\n\n\n\n'
|
output += '\n\n\n\n\n\n'
|
||||||
output = u''.join(output)
|
output = u''.join(output)
|
||||||
output = u'\n'.join(l.rstrip() for l in output.splitlines())
|
output = u'\n'.join(l.rstrip() for l in output.splitlines())
|
||||||
if self.opts.unsmarten_punctuation:
|
|
||||||
output = unsmarten(output)
|
|
||||||
output = self.cleanup_text(output)
|
output = self.cleanup_text(output)
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
@ -1,18 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
|
||||||
__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
def unsmarten(txt):
|
|
||||||
txt = re.sub(u'–|–|–', r'-', txt) # en-dash
|
|
||||||
txt = re.sub(u'—|—|—', r'--', txt) # em-dash
|
|
||||||
txt = re.sub(u'…|…|…', r'...', txt) # ellipsis
|
|
||||||
|
|
||||||
txt = re.sub(u'“|”|″|“|”|″|“|”|″', r'"', txt) # double quote
|
|
||||||
txt = re.sub(u'(["\'‘“]|\s)’', r"\1{'/}", txt) # apostrophe
|
|
||||||
txt = re.sub(u'‘|’|′|‘|’|′|‘|’|′', r"'", txt) # single quote
|
|
||||||
|
|
||||||
return txt
|
|
@ -22,13 +22,14 @@ class LookAndFeelWidget(Widget, Ui_Form):
|
|||||||
Widget.__init__(self, parent,
|
Widget.__init__(self, parent,
|
||||||
['change_justification', 'extra_css', 'base_font_size',
|
['change_justification', 'extra_css', 'base_font_size',
|
||||||
'font_size_mapping', 'line_height', 'minimum_line_height',
|
'font_size_mapping', 'line_height', 'minimum_line_height',
|
||||||
'linearize_tables', 'smarten_punctuation',
|
'smarten_punctuation', 'unsmarten_punctuation',
|
||||||
'disable_font_rescaling', 'insert_blank_line',
|
'disable_font_rescaling', 'insert_blank_line',
|
||||||
'remove_paragraph_spacing',
|
'remove_paragraph_spacing',
|
||||||
'remove_paragraph_spacing_indent_size',
|
'remove_paragraph_spacing_indent_size',
|
||||||
'insert_blank_line_size',
|
'insert_blank_line_size',
|
||||||
'input_encoding',
|
'input_encoding',
|
||||||
'asciiize', 'keep_ligatures']
|
'asciiize', 'keep_ligatures',
|
||||||
|
'linearize_tables']
|
||||||
)
|
)
|
||||||
for val, text in [
|
for val, text in [
|
||||||
('original', _('Original')),
|
('original', _('Original')),
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
<x>0</x>
|
<x>0</x>
|
||||||
<y>0</y>
|
<y>0</y>
|
||||||
<width>642</width>
|
<width>642</width>
|
||||||
<height>500</height>
|
<height>522</height>
|
||||||
</rect>
|
</rect>
|
||||||
</property>
|
</property>
|
||||||
<property name="windowTitle">
|
<property name="windowTitle">
|
||||||
@ -84,7 +84,7 @@
|
|||||||
<string>...</string>
|
<string>...</string>
|
||||||
</property>
|
</property>
|
||||||
<property name="icon">
|
<property name="icon">
|
||||||
<iconset resource="../../../../resources/images.qrc">
|
<iconset>
|
||||||
<normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset>
|
<normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset>
|
||||||
</property>
|
</property>
|
||||||
<property name="iconSize">
|
<property name="iconSize">
|
||||||
@ -194,13 +194,6 @@
|
|||||||
<item row="8" column="2" colspan="3">
|
<item row="8" column="2" colspan="3">
|
||||||
<widget class="QComboBox" name="opt_change_justification"/>
|
<widget class="QComboBox" name="opt_change_justification"/>
|
||||||
</item>
|
</item>
|
||||||
<item row="9" column="0">
|
|
||||||
<widget class="QCheckBox" name="opt_linearize_tables">
|
|
||||||
<property name="text">
|
|
||||||
<string>&amp;Linearize tables</string>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="9" column="1" colspan="4">
|
<item row="9" column="1" colspan="4">
|
||||||
<widget class="QCheckBox" name="opt_asciiize">
|
<widget class="QCheckBox" name="opt_asciiize">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
@ -215,7 +208,7 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="12" column="0" colspan="5">
|
<item row="13" column="0" colspan="5">
|
||||||
<widget class="QGroupBox" name="groupBox">
|
<widget class="QGroupBox" name="groupBox">
|
||||||
<property name="title">
|
<property name="title">
|
||||||
<string>Extra &amp;CSS</string>
|
<string>Extra &amp;CSS</string>
|
||||||
@ -240,13 +233,6 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="10" column="0">
|
|
||||||
<widget class="QCheckBox" name="opt_smarten_punctuation">
|
|
||||||
<property name="text">
|
|
||||||
<string>Smarten &amp;punctuation</string>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="6" column="3">
|
<item row="6" column="3">
|
||||||
<widget class="QLabel" name="label_4">
|
<widget class="QLabel" name="label_4">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
@ -273,6 +259,27 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
|
<item row="9" column="0">
|
||||||
|
<widget class="QCheckBox" name="opt_smarten_punctuation">
|
||||||
|
<property name="text">
|
||||||
|
<string>Smarten &amp;punctuation</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="10" column="0">
|
||||||
|
<widget class="QCheckBox" name="opt_unsmarten_punctuation">
|
||||||
|
<property name="text">
|
||||||
|
<string>&amp;UnSmarten punctuation</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="10" column="3">
|
||||||
|
<widget class="QCheckBox" name="opt_linearize_tables">
|
||||||
|
<property name="text">
|
||||||
|
<string>&amp;Linearize tables</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
</layout>
|
</layout>
|
||||||
</widget>
|
</widget>
|
||||||
<customwidgets>
|
<customwidgets>
|
||||||
|
@ -7,26 +7,32 @@ import re
|
|||||||
from UserDict import UserDict
|
from UserDict import UserDict
|
||||||
|
|
||||||
class MReplace(UserDict):
|
class MReplace(UserDict):
|
||||||
def __init__(self, dict = None):
|
|
||||||
UserDict.__init__(self, dict)
|
def __init__(self, data=None, case_sensitive=True):
|
||||||
|
UserDict.__init__(self, data)
|
||||||
self.re = None
|
self.re = None
|
||||||
self.regex = None
|
self.regex = None
|
||||||
|
self.case_sensitive = case_sensitive
|
||||||
self.compile_regex()
|
self.compile_regex()
|
||||||
|
|
||||||
def compile_regex(self):
|
def compile_regex(self):
|
||||||
if len(self.data) > 0:
|
if len(self.data) > 0:
|
||||||
keys = sorted(self.data.keys(), key=len)
|
keys = sorted(self.data.keys(), key=len)
|
||||||
keys.reverse()
|
keys.reverse()
|
||||||
tmp = "(%s)" % "|".join(map(re.escape, keys))
|
tmp = "(%s)" % "|".join(map(re.escape, keys))
|
||||||
if self.re != tmp:
|
if self.re != tmp:
|
||||||
self.re = tmp
|
self.re = tmp
|
||||||
self.regex = re.compile(self.re)
|
if self.case_sensitive:
|
||||||
|
self.regex = re.compile(self.re)
|
||||||
|
else:
|
||||||
|
self.regex = re.compile(self.re, re.I)
|
||||||
|
|
||||||
def __call__(self, mo):
|
def __call__(self, mo):
|
||||||
return self[mo.string[mo.start():mo.end()]]
|
return self[mo.string[mo.start():mo.end()]]
|
||||||
|
|
||||||
def mreplace(self, text):
|
def mreplace(self, text):
|
||||||
#Replace without regex compile
|
#Replace without regex compile
|
||||||
if len(self.data) < 1 or self.re is None:
|
if len(self.data) < 1 or self.re is None:
|
||||||
return text
|
return text
|
||||||
return self.regex.sub(self, text)
|
return self.regex.sub(self, text)
|
||||||
|
|
||||||
|
43
src/calibre/utils/unsmarten.py
Normal file
43
src/calibre/utils/unsmarten.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
||||||
|
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
from calibre.utils.mreplace import MReplace
|
||||||
|
|
||||||
|
# Lookup table mapping typographic punctuation to plain ASCII replacements.
# Each mark is listed in three spellings: numeric character reference, named
# HTML entity and the literal character. Listing the same literal key three
# times (as a dict literal silently allows) would leave only the last entry
# in effect and drop coverage of entity-encoded text.
# NOTE(review): the entity spellings were reconstructed from the standard
# HTML entity names/code points -- confirm against the upstream file.
_mreplace = MReplace({
    # en dash
    '&#8211;': '--',
    '&ndash;': '--',
    '–': '--',
    # em dash
    '&#8212;': '---',
    '&mdash;': '---',
    '—': '---',
    # horizontal ellipsis
    '&#8230;': '...',
    '&hellip;': '...',
    '…': '...',
    # double quotes and double prime
    '&#8220;': '"',
    '&#8221;': '"',
    '&#8243;': '"',
    '&ldquo;': '"',
    '&rdquo;': '"',
    '&Prime;': '"',
    '“': '"',
    '”': '"',
    '″': '"',
    # single quotes and prime
    '&#8216;': "'",
    '&#8217;': "'",
    '&#8242;': "'",
    '&lsquo;': "'",
    '&rsquo;': "'",
    '&prime;': "'",
    '‘': "'",
    '’': "'",
    '′': "'",
})

# Public entry point: call with a string, get the string back with every key
# above replaced by its plain equivalent.
unsmarten_text = _mreplace.mreplace
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user