mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Make unsmarten punctuation a global option.
This commit is contained in:
parent
123991aea5
commit
ec448064aa
@ -134,7 +134,8 @@ def add_pipeline_options(parser, plumber):
|
||||
'font_size_mapping',
|
||||
'line_height', 'minimum_line_height',
|
||||
'linearize_tables',
|
||||
'extra_css', 'smarten_punctuation',
|
||||
'extra_css',
|
||||
'smarten_punctuation', 'unsmarten_punctuation',
|
||||
'margin_top', 'margin_left', 'margin_right',
|
||||
'margin_bottom', 'change_justification',
|
||||
'insert_blank_line', 'insert_blank_line_size',
|
||||
|
@ -415,6 +415,13 @@ OptionRecommendation(name='smarten_punctuation',
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='unsmarten_punctuation',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Convert fancy quotes, dashes and ellipsis to their '
|
||||
'plain equivalents.'
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='read_metadata_from_opf',
|
||||
recommended_value=None, level=OptionRecommendation.LOW,
|
||||
short_switch='m',
|
||||
|
@ -605,6 +605,9 @@ class HTMLPreProcessor(object):
|
||||
|
||||
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
||||
html = self.smarten_punctuation(html)
|
||||
|
||||
if getattr(self.extra_opts, 'unsmarten_punctuation', False):
|
||||
html = self.unsmarten_punctuation(html)
|
||||
|
||||
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
|
||||
if unsupported_unicode_chars:
|
||||
@ -636,3 +639,12 @@ class HTMLPreProcessor(object):
|
||||
html = re.sub(r'\s--\s', u'\u2014', html)
|
||||
return substitute_entites(html)
|
||||
|
||||
def unsmarten_punctuation(self, html):
    """Return *html* with typographic punctuation replaced by plain forms.

    The markup is run through three steps, in this order:

    1. ``HeuristicProcessor.fix_nbsp_indents`` (built from this object's
       ``extra_opts`` and ``log``),
    2. ``unsmarten_html`` from ``calibre.utils.unsmarten``,
    3. ``substitute_entites`` from ``calibre.ebooks.chardet``.
    """
    # Imported lazily, inside the method, exactly as the helpers are only
    # needed when the option is switched on.
    from calibre.utils.unsmarten import unsmarten_html
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor

    heuristics = HeuristicProcessor(self.extra_opts, self.log)
    plain_markup = unsmarten_html(heuristics.fix_nbsp_indents(html))
    return substitute_entites(plain_markup)
|
||||
|
||||
|
@ -7,9 +7,6 @@ __docformat__ = 'restructuredtext en'
|
||||
import re
|
||||
|
||||
def unsmarten(txt):
|
||||
from calibre.ebooks.txt.unsmarten import unsmarten as txt_unsmarten
|
||||
txt = txt_unsmarten(txt)
|
||||
|
||||
txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent
|
||||
txt = re.sub(u'£|£|£', r'{L-}', txt) # pound
|
||||
txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen
|
||||
|
@ -15,7 +15,6 @@ from functools import partial
|
||||
from calibre.ebooks.htmlz.oeb2html import OEB2HTML
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
from calibre.ebooks.txt.unsmarten import unsmarten
|
||||
|
||||
class MarkdownMLizer(OEB2HTML):
|
||||
|
||||
@ -34,8 +33,6 @@ class MarkdownMLizer(OEB2HTML):
|
||||
self.style_italic = False
|
||||
|
||||
txt = self.mlize_spine(oeb_book)
|
||||
if self.opts.unsmarten_punctuation:
|
||||
txt = unsmarten(txt)
|
||||
|
||||
# Do some tidying up
|
||||
txt = self.tidy_up(txt)
|
||||
|
@ -56,10 +56,6 @@ class TXTOutput(OutputFormatPlugin):
|
||||
'* plain: Produce plain text.\n'
|
||||
'* markdown: Produce Markdown formatted text.\n'
|
||||
'* textile: Produce Textile formatted text.')),
|
||||
OptionRecommendation(name='unsmarten_punctuation',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Convert fancy quotes, dashes and ellipsis to their '
|
||||
'plain equivalents.')),
|
||||
OptionRecommendation(name='keep_links',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Do not remove links within the document. This is only ' \
|
||||
|
@ -12,8 +12,6 @@ import re
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.txt.unsmarten import unsmarten
|
||||
|
||||
|
||||
BLOCK_TAGS = [
|
||||
'div',
|
||||
@ -78,8 +76,6 @@ class TXTMLizer(object):
|
||||
output += '\n\n\n\n\n\n'
|
||||
output = u''.join(output)
|
||||
output = u'\n'.join(l.rstrip() for l in output.splitlines())
|
||||
if self.opts.unsmarten_punctuation:
|
||||
output = unsmarten(output)
|
||||
output = self.cleanup_text(output)
|
||||
|
||||
return output
|
||||
|
@ -1,18 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
def unsmarten(txt):
    """Replace typographic punctuation in *txt* with plain ASCII forms.

    Every character is recognised as a numeric entity, a named entity, or
    the literal code point.  (Previously each alternation listed the same
    literal character three times, so entity-encoded input was left
    untouched.)

    Replacements, applied in this order:

    * en-dash -> ``-``
    * em-dash -> ``--``
    * ellipsis -> ``...``
    * left/right double quote, double prime -> ``"``
    * a right single quote that follows a quote character or whitespace
      -> ``{'/}``
    * left/right single quote, prime -> ``'``

    :param txt: unicode text to convert.
    :return: the converted text.
    """
    rules = (
        (u'&#8211;|&ndash;|\u2013', r'-'),    # en-dash
        (u'&#8212;|&mdash;|\u2014', r'--'),   # em-dash
        (u'&#8230;|&hellip;|\u2026', r'...'),  # ellipsis
        (u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|'
         u'\u201c|\u201d|\u2033', r'"'),      # double quote
        # Must run before the single-quote rule, which would otherwise
        # consume the right single quote this pattern looks for.
        # NOTE(review): ``{'/}`` looks like a placeholder for a leading
        # apostrophe that a later stage interprets -- confirm with callers.
        (u'(["\'\u2018\u201c]|\\s)\u2019', r"\1{'/}"),  # apostrophe
        (u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|'
         u'\u2018|\u2019|\u2032', r"'"),      # single quote
    )
    for pattern, plain in rules:
        txt = re.sub(pattern, plain, txt)
    return txt
|
@ -22,13 +22,14 @@ class LookAndFeelWidget(Widget, Ui_Form):
|
||||
Widget.__init__(self, parent,
|
||||
['change_justification', 'extra_css', 'base_font_size',
|
||||
'font_size_mapping', 'line_height', 'minimum_line_height',
|
||||
'linearize_tables', 'smarten_punctuation',
|
||||
'smarten_punctuation', 'unsmarten_punctuation',
|
||||
'disable_font_rescaling', 'insert_blank_line',
|
||||
'remove_paragraph_spacing',
|
||||
'remove_paragraph_spacing_indent_size',
|
||||
'insert_blank_line_size',
|
||||
'input_encoding',
|
||||
'asciiize', 'keep_ligatures']
|
||||
'asciiize', 'keep_ligatures',
|
||||
'linearize_tables']
|
||||
)
|
||||
for val, text in [
|
||||
('original', _('Original')),
|
||||
|
@ -7,7 +7,7 @@
|
||||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>642</width>
|
||||
<height>500</height>
|
||||
<height>522</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="windowTitle">
|
||||
@ -84,7 +84,7 @@
|
||||
<string>...</string>
|
||||
</property>
|
||||
<property name="icon">
|
||||
<iconset resource="../../../../resources/images.qrc">
|
||||
<iconset>
|
||||
<normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset>
|
||||
</property>
|
||||
<property name="iconSize">
|
||||
@ -194,13 +194,6 @@
|
||||
<item row="8" column="2" colspan="3">
|
||||
<widget class="QComboBox" name="opt_change_justification"/>
|
||||
</item>
|
||||
<item row="9" column="0">
|
||||
<widget class="QCheckBox" name="opt_linearize_tables">
|
||||
<property name="text">
|
||||
<string>&Linearize tables</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="9" column="1" colspan="4">
|
||||
<widget class="QCheckBox" name="opt_asciiize">
|
||||
<property name="text">
|
||||
@ -215,7 +208,7 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="12" column="0" colspan="5">
|
||||
<item row="13" column="0" colspan="5">
|
||||
<widget class="QGroupBox" name="groupBox">
|
||||
<property name="title">
|
||||
<string>Extra &CSS</string>
|
||||
@ -240,13 +233,6 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="10" column="0">
|
||||
<widget class="QCheckBox" name="opt_smarten_punctuation">
|
||||
<property name="text">
|
||||
<string>Smarten &punctuation</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="6" column="3">
|
||||
<widget class="QLabel" name="label_4">
|
||||
<property name="text">
|
||||
@ -273,6 +259,27 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="9" column="0">
|
||||
<widget class="QCheckBox" name="opt_smarten_punctuation">
|
||||
<property name="text">
|
||||
<string>Smarten &punctuation</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="10" column="0">
|
||||
<widget class="QCheckBox" name="opt_unsmarten_punctuation">
|
||||
<property name="text">
|
||||
<string>&UnSmarten punctuation</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="10" column="3">
|
||||
<widget class="QCheckBox" name="opt_linearize_tables">
|
||||
<property name="text">
|
||||
<string>&Linearize tables</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
<customwidgets>
|
||||
|
64
src/calibre/utils/unsmarten.py
Normal file
64
src/calibre/utils/unsmarten.py
Normal file
@ -0,0 +1,64 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
from lxml import html as lhtml
|
||||
|
||||
from calibre import prepare_string_for_xml
|
||||
from calibre.ebooks.oeb.base import barename
|
||||
|
||||
def unsmarten_html(html):
    """Return *html* re-serialised with typographic punctuation made plain.

    The markup is parsed with ``lxml.html.fromstring`` and written back out
    element by element.  Text and tail strings are passed through
    ``unsmarten_text`` and then ``prepare_string_for_xml``; attribute
    values are passed through ``prepare_string_for_xml`` only.

    Text anywhere inside a ``<pre>`` element (including nested elements and
    their tails) is left unconverted, but it is still passed through
    ``prepare_string_for_xml`` like all other text so that the output stays
    consistently escaped.

    Every element is written with an explicit closing tag.

    :param html: markup string to process.
    :return: the rewritten markup as a single string.
    """
    def render(text, verbatim):
        # ``verbatim`` text keeps its punctuation; everything is escaped.
        if not verbatim:
            text = unsmarten_text(text)
        return prepare_string_for_xml(text)

    def dump_text(elem, in_pre=False):
        # NOTE(review): assumes elem.tag is a string; lxml comment and
        # processing-instruction nodes carry a callable tag -- confirm
        # that barename() copes with those if they can reach this point.
        tag = barename(elem.tag)
        inside_pre = in_pre or tag == 'pre'

        # Turn the attributes into a string we can write with the tag.
        at = ''.join(
            ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
            for k, v in elem.attrib.items())

        # Write the tag.
        text = ['<%s%s>' % (tag, at)]

        # Text directly inside this element.
        if getattr(elem, 'text', None):
            text.append(render(elem.text, inside_pre))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text.extend(dump_text(item, inside_pre))

        # Close the tag.
        text.append('</%s>' % tag)

        # The tail follows the closing tag, so it belongs to the parent:
        # it is only verbatim when the parent is itself within <pre>.
        if getattr(elem, 'tail', None):
            text.append(render(elem.tail, in_pre))
        return text

    content = lhtml.fromstring(html)
    return ''.join(dump_text(content))
|
||||
|
||||
|
||||
def unsmarten_text(txt):
    """Replace typographic punctuation in *txt* with plain ASCII forms.

    Every character is recognised as a numeric entity, a named entity, or
    the literal code point.  (Previously each alternation listed the same
    literal character three times, so entity-encoded input was left
    untouched.)

    Replacements, applied in this order:

    * en-dash -> ``--``
    * em-dash -> ``---``
    * ellipsis -> ``...``
    * left/right double quote, double prime -> ``"``
    * left/right single quote, prime -> ``'``

    :param txt: unicode text to convert.
    :return: the converted text.
    """
    rules = (
        (u'&#8211;|&ndash;|\u2013', r'--'),    # en-dash
        (u'&#8212;|&mdash;|\u2014', r'---'),   # em-dash
        (u'&#8230;|&hellip;|\u2026', r'...'),  # ellipsis
        (u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|'
         u'\u201c|\u201d|\u2033', r'"'),       # double quote
        (u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|'
         u'\u2018|\u2019|\u2032', r"'"),       # single quote
    )
    for pattern, plain in rules:
        txt = re.sub(pattern, plain, txt)
    return txt
|
||||
|
Loading…
x
Reference in New Issue
Block a user