diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index c605df0de4..ed332acac2 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -134,7 +134,8 @@ def add_pipeline_options(parser, plumber): 'font_size_mapping', 'line_height', 'minimum_line_height', 'linearize_tables', - 'extra_css', 'smarten_punctuation', + 'extra_css', + 'smarten_punctuation', 'unsmarten_punctuation', 'margin_top', 'margin_left', 'margin_right', 'margin_bottom', 'change_justification', 'insert_blank_line', 'insert_blank_line_size', diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index adff954e62..3e5313eb96 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -415,6 +415,13 @@ OptionRecommendation(name='smarten_punctuation', ) ), +OptionRecommendation(name='unsmarten_punctuation', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Convert fancy quotes, dashes and ellipsis to their ' + 'plain equivalents.' + ) + ), + OptionRecommendation(name='read_metadata_from_opf', recommended_value=None, level=OptionRecommendation.LOW, short_switch='m', @@ -1017,6 +1024,10 @@ OptionRecommendation(name='sr3_replace', self.output_plugin.file_type not in ('mobi', 'lrf'): from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables LinearizeTables()(self.oeb, self.opts) + + if self.opts.unsmarten_punctuation: + from calibre.ebooks.oeb.transforms.unsmarten import UnsmartenPunctuation + UnsmartenPunctuation()(self.oeb, self.opts) flattener = CSSFlattener(fbase=fbase, fkey=fkey, lineh=line_height, diff --git a/src/calibre/ebooks/oeb/transforms/unsmarten.py b/src/calibre/ebooks/oeb/transforms/unsmarten.py new file mode 100644 index 0000000000..c01094681f --- /dev/null +++ b/src/calibre/ebooks/oeb/transforms/unsmarten.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- + +from __future__ import (unicode_literals, division, absolute_import, print_function) + +__license__ = 'GPL 3' +__copyright__ = '2011, John Schember ' +__docformat__ = 'restructuredtext en' + +from calibre.ebooks.oeb.base import OEB_DOCS, XPath, barename +from calibre.utils.unsmarten import unsmarten_text + +class UnsmartenPunctuation(object): + + def __init__(self): + self.html_tags = XPath('descendant::h:*') + + def unsmarten(self, root): + for x in self.html_tags(root): + if not barename(x) == 'pre': + if getattr(x, 'text', None): + x.text = unsmarten_text(x.text) + if getattr(x, 'tail', None) and x.tail: + x.tail = unsmarten_text(x.tail) + + def __call__(self, oeb, context): + bx = XPath('//h:body') + for x in oeb.manifest.items: + if x.media_type in OEB_DOCS: + for body in bx(x.data): + self.unsmarten(body) + diff --git a/src/calibre/ebooks/textile/unsmarten.py b/src/calibre/ebooks/textile/unsmarten.py index c31bb77c24..94127c5c39 100644 --- a/src/calibre/ebooks/textile/unsmarten.py +++ b/src/calibre/ebooks/textile/unsmarten.py @@ -7,9 +7,6 @@ __docformat__ = 'restructuredtext en' import re def unsmarten(txt): - from calibre.ebooks.txt.unsmarten import unsmarten as txt_unsmarten - txt = txt_unsmarten(txt) - txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent txt = re.sub(u'£|£|£', r'{L-}', txt) # pound txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py index 878633add3..79cfabf65e 100644 --- a/src/calibre/ebooks/txt/markdownml.py +++ b/src/calibre/ebooks/txt/markdownml.py @@ -15,7 +15,6 @@ from functools import partial from calibre.ebooks.htmlz.oeb2html import OEB2HTML from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links from calibre.ebooks.oeb.stylizer import Stylizer -from calibre.ebooks.txt.unsmarten import unsmarten class MarkdownMLizer(OEB2HTML): @@ -34,8 +33,6 @@ class MarkdownMLizer(OEB2HTML): self.style_italic = False txt = self.mlize_spine(oeb_book) - if self.opts.unsmarten_punctuation: - txt = unsmarten(txt) # Do some tidying up txt = self.tidy_up(txt) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 47ee734a04..d9c42eb1dc 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -56,10 +56,6 @@ class TXTOutput(OutputFormatPlugin): '* plain: Produce plain text.\n' '* markdown: Produce Markdown formatted text.\n' '* textile: Produce Textile formatted text.')), - OptionRecommendation(name='unsmarten_punctuation', - recommended_value=False, level=OptionRecommendation.LOW, - help=_('Convert fancy quotes, dashes and ellipsis to their ' - 'plain equivalents.')), OptionRecommendation(name='keep_links', recommended_value=False, level=OptionRecommendation.LOW, help=_('Do not remove links within the document. This is only ' \ diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 500ce1d9c7..de712abd07 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -83,7 +83,7 @@ class TextileMLizer(OEB2HTML): for i in self.our_ids: if i not in self.our_links: text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text) - + # Remove obvious non-needed escaping, add sub/sup-script ones text = check_escaping(text, ['\*', '_', '\*']) # escape the super/sub-scripts if needed @@ -189,7 +189,7 @@ class TextileMLizer(OEB2HTML): emright = int(round(right / stylizer.profile.fbase)) if emright >= 1: txt += ')' * emright - + return txt def check_id_tag(self, attribs): @@ -235,7 +235,7 @@ class TextileMLizer(OEB2HTML): tags = [] tag = barename(elem.tag) attribs = elem.attrib - + # Ignore anything that is set to not be displayed. if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': @@ -246,7 +246,7 @@ class TextileMLizer(OEB2HTML): ems = int(round(float(style.marginTop) / style.fontSize) - 1) if ems >= 1: text.append(u'\n\n\xa0' * ems) - + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): if tag == 'div': tag = 'p' @@ -432,7 +432,7 @@ class TextileMLizer(OEB2HTML): 'span', 'table', 'tr', 'td'): if not self.in_a_link: text.append(self.check_styles(style)) - + # Process tags that contain text. if hasattr(elem, 'text') and elem.text: txt = elem.text diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 5d2b03d98e..2320fbbbc7 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -12,8 +12,6 @@ import re from lxml import etree -from calibre.ebooks.txt.unsmarten import unsmarten - BLOCK_TAGS = [ 'div', @@ -78,8 +76,6 @@ class TXTMLizer(object): output += '\n\n\n\n\n\n' output = u''.join(output) output = u'\n'.join(l.rstrip() for l in output.splitlines()) - if self.opts.unsmarten_punctuation: - output = unsmarten(output) output = self.cleanup_text(output) return output diff --git a/src/calibre/ebooks/txt/unsmarten.py b/src/calibre/ebooks/txt/unsmarten.py deleted file mode 100644 index 53f686c2fc..0000000000 --- a/src/calibre/ebooks/txt/unsmarten.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- - -__license__ = 'GPL 3' -__copyright__ = '2011, Leigh Parry ' -__docformat__ = 'restructuredtext en' - -import re - -def unsmarten(txt): - txt = re.sub(u'–|–|–', r'-', txt) # en-dash - txt = re.sub(u'—|—|—', r'--', txt) # em-dash - txt = re.sub(u'…|…|…', r'...', txt) # ellipsis - - txt = re.sub(u'“|”|″|“|”|″|“|”|″', r'"', txt) # double quote - txt = re.sub(u'(["\'‘“]|\s)’', r"\1{'/}", txt) # apostrophe - txt = re.sub(u'‘|’|′|‘|’|′|‘|’|′', r"'", txt) # single quote - - return txt diff --git a/src/calibre/gui2/convert/look_and_feel.py b/src/calibre/gui2/convert/look_and_feel.py index 9b008c0d6d..5ca7e1ea02 100644 --- a/src/calibre/gui2/convert/look_and_feel.py +++ b/src/calibre/gui2/convert/look_and_feel.py @@ -22,13 +22,14 @@ class LookAndFeelWidget(Widget, Ui_Form): Widget.__init__(self, parent, ['change_justification', 'extra_css', 'base_font_size', 'font_size_mapping', 'line_height', 'minimum_line_height', - 'linearize_tables', 'smarten_punctuation', + 'smarten_punctuation', 'unsmarten_punctuation', 'disable_font_rescaling', 'insert_blank_line', 'remove_paragraph_spacing', 'remove_paragraph_spacing_indent_size', 'insert_blank_line_size', 'input_encoding', - 'asciiize', 'keep_ligatures'] + 'asciiize', 'keep_ligatures', + 'linearize_tables'] ) for val, text in [ ('original', _('Original')), diff --git a/src/calibre/gui2/convert/look_and_feel.ui b/src/calibre/gui2/convert/look_and_feel.ui index 0aa91e0f47..055d569212 100644 --- a/src/calibre/gui2/convert/look_and_feel.ui +++ b/src/calibre/gui2/convert/look_and_feel.ui @@ -7,7 +7,7 @@ 0 0 642 - 500 + 522 @@ -84,7 +84,7 @@ ... - + :/images/wizard.png:/images/wizard.png @@ -194,13 +194,6 @@ - - - - &Linearize tables - - - @@ -215,7 +208,7 @@ - + Extra &CSS @@ -240,13 +233,6 @@ - - - - Smarten &punctuation - - - @@ -273,6 +259,27 @@ + + + + Smarten &punctuation + + + + + + + &UnSmarten punctuation + + + + + + + &Linearize tables + + + diff --git a/src/calibre/utils/mreplace.py b/src/calibre/utils/mreplace.py index b9fbc0bded..70591d6ca7 100644 --- a/src/calibre/utils/mreplace.py +++ b/src/calibre/utils/mreplace.py @@ -7,26 +7,32 @@ import re from UserDict import UserDict class MReplace(UserDict): - def __init__(self, dict = None): - UserDict.__init__(self, dict) + + def __init__(self, data=None, case_sensitive=True): + UserDict.__init__(self, data) self.re = None self.regex = None + self.case_sensitive = case_sensitive self.compile_regex() - def compile_regex(self): + def compile_regex(self): if len(self.data) > 0: keys = sorted(self.data.keys(), key=len) keys.reverse() tmp = "(%s)" % "|".join(map(re.escape, keys)) if self.re != tmp: self.re = tmp - self.regex = re.compile(self.re) + if self.case_sensitive: + self.regex = re.compile(self.re) + else: + self.regex = re.compile(self.re, re.I) - def __call__(self, mo): + def __call__(self, mo): return self[mo.string[mo.start():mo.end()]] - def mreplace(self, text): + def mreplace(self, text): #Replace without regex compile if len(self.data) < 1 or self.re is None: return text - return self.regex.sub(self, text) \ No newline at end of file + return self.regex.sub(self, text) + diff --git a/src/calibre/utils/unsmarten.py b/src/calibre/utils/unsmarten.py new file mode 100644 index 0000000000..7944f710b0 --- /dev/null +++ b/src/calibre/utils/unsmarten.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- + +from __future__ import (unicode_literals, division, absolute_import, print_function) + +__license__ = 'GPL 3' +__copyright__ = '2011, John Schember ' +__docformat__ = 'restructuredtext en' + +from calibre.utils.mreplace import MReplace + +_mreplace = MReplace({ + '–': '--', + '–': '--', + '–': '--', + '—': '---', + '—': '---', + '—': '---', + '…': '...', + '…': '...', + '…': '...', + '“': '"', + '”': '"', + '″': '"', + '“': '"', + '”': '"', + '″': '"', + '“':'"', + '”':'"', + '″':'"', + '‘':"'", + '’':"'", + '′':"'", + '‘':"'", + '’':"'", + '′':"'", + '‘':"'", + '’':"'", + '′':"'", +} +) +unsmarten_text = _mreplace.mreplace + +