mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Implement #2846 (Convert common unicode punctuation to ascii.)
This commit is contained in:
commit
ee1c2e4cad
@ -1,4 +1,4 @@
|
||||
from __future__ import with_statement
|
||||
# -*- coding: utf-8 -*-
|
||||
'''
|
||||
Defines the plugin system for conversions.
|
||||
'''
|
||||
|
@ -125,6 +125,7 @@ def add_pipeline_options(parser, plumber):
|
||||
'margin_top', 'margin_left', 'margin_right',
|
||||
'margin_bottom', 'dont_justify',
|
||||
'insert_blank_line', 'remove_paragraph_spacing',
|
||||
'asciiize',
|
||||
]
|
||||
),
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
from __future__ import with_statement
|
||||
# -*- coding: utf-8 -*-
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
@ -348,6 +348,20 @@ OptionRecommendation(name='read_metadata_from_opf',
|
||||
'file.')
|
||||
),
|
||||
|
||||
# Pipeline option that enables transliteration of non-ASCII text to ASCII.
# The %s placeholder in the help text is filled with a Cyrillic sample name
# so the user sees a concrete before/after example.
OptionRecommendation(name='asciiize',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=(_('Transliterate unicode characters to an ASCII '
            'representation. Use with care because this will replace '
            'unicode characters with ASCII. For instance it will replace "%s" '
            'with "Mikhail Gorbachiov". Also, note that in '
            'cases where there are multiple representations of a character '
            '(characters shared by Chinese and Japanese for instance) the '
            'representation used by the largest number of people will be '
            'used (Chinese in the previous example).') % (
                u'\u041c\u0438\u0445\u0430\u0438\u043b '
                u'\u0413\u043e\u0440\u0431\u0430\u0447\u0451\u0432')
        )
),
|
||||
|
||||
OptionRecommendation(name='title',
|
||||
recommended_value=None, level=OptionRecommendation.LOW,
|
||||
|
@ -221,6 +221,11 @@ class HTMLPreProcessor(object):
|
||||
|
||||
html = XMLDECL_RE.sub('', html)
|
||||
|
||||
if getattr(self.extra_opts, 'asciiize', False):
|
||||
from calibre.ebooks.unidecode.unidecoder import Unidecoder
|
||||
unidecoder = Unidecoder()
|
||||
html = unidecoder.decode(html)
|
||||
|
||||
if self.plugin_preprocess:
|
||||
html = self.input_plugin_preprocess(html)
|
||||
|
||||
|
0
src/calibre/ebooks/unidecode/__init__.py
Normal file
0
src/calibre/ebooks/unidecode/__init__.py
Normal file
3256
src/calibre/ebooks/unidecode/unicodepoints.py
Normal file
3256
src/calibre/ebooks/unidecode/unicodepoints.py
Normal file
File diff suppressed because it is too large
Load Diff
104
src/calibre/ebooks/unidecode/unidecoder.py
Normal file
104
src/calibre/ebooks/unidecode/unidecoder.py
Normal file
@ -0,0 +1,104 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Decode unicode text to an ASCII representation of the text. Transliterate
|
||||
unicode characters to ASCII.
|
||||
|
||||
Based on the ruby unidecode gem (http://rubyforge.org/projects/unidecode/) which
|
||||
is based on the perl module Text::Unidecode
|
||||
(http://search.cpan.org/~sburke/Text-Unidecode-0.04/). More information about
|
||||
unidecode can be found at
|
||||
http://interglacial.com/~sburke/tpj/as_html/tpj22.html.
|
||||
|
||||
The major differences between this implementation and others are that it is written in
|
||||
python and it uses a single dictionary instead of loading the code group files
|
||||
as needed.
|
||||
|
||||
|
||||
Copyright (c) 2007 Russell Norris
|
||||
|
||||
Permission is hereby granted, free of charge, to any person
|
||||
obtaining a copy of this software and associated documentation
|
||||
files (the "Software"), to deal in the Software without
|
||||
restriction, including without limitation the rights to use,
|
||||
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
|
||||
Copyright 2001, Sean M. Burke <sburke@cpan.org>, all rights reserved.
|
||||
|
||||
The programs and documentation in this dist are distributed in the
|
||||
hope that they will be useful, but without any warranty; without even
|
||||
the implied warranty of merchantability or fitness for a particular
|
||||
purpose.
|
||||
|
||||
This library is free software; you can redistribute it and/or modify
|
||||
it under the same terms as Perl itself.
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from calibre.ebooks.unidecode.unicodepoints import CODEPOINTS
|
||||
|
||||
class Unidecoder(object):
    '''
    Transliterate unicode text to an ASCII approximation using the
    CODEPOINTS lookup table.
    '''

    # Matches every character outside the 7-bit ASCII range. Compiled once
    # at class creation instead of being looked up on every decode() call.
    _NON_ASCII_RE = re.compile(u'[^\x00-\x7f]')

    def decode(self, text):
        '''
        Transliterate the string from unicode characters to ASCII.

        `text` may be a unicode string or a byte string. Byte strings are
        converted with the default codec first and, if that fails, decoded
        as UTF-8 with undecodable bytes ignored.

        Returns the text with every non-ASCII character replaced by the
        result of replace_point().
        '''
        # The keys for CODEPOINTS are unicode characters, so we want to be
        # sure the input text is unicode.
        if not isinstance(text, unicode):
            try:
                text = unicode(text)
            except UnicodeError:
                # Non-ASCII byte string: fall back to a lenient UTF-8 decode.
                text = text.decode('utf-8', 'ignore')
        # Replace characters larger than 127 with their ASCII equivalent.
        return self._NON_ASCII_RE.sub(
                lambda match: self.replace_point(match.group()), text)

    def replace_point(self, codepoint):
        '''
        Returns the replacement character or ? if none can be found.
        '''
        try:
            # Split the unicode character xABCD into parts 0xAB and 0xCD.
            # 0xAB represents the group within CODEPOINTS to query and 0xCD
            # represents the position in the list of characters for the group.
            group = CODEPOINTS[self.code_group(codepoint)]
            return group[self.grouped_point(codepoint)]
        except (KeyError, IndexError, TypeError):
            # KeyError: no such group; IndexError: position outside the
            # group's list; TypeError: ord() was handed something that is
            # not a single character.
            return '?'

    def code_group(self, character):
        '''
        Find what group character is a part of.
        '''
        # Code groups within CODEPOINTS take the form 'xAB'.
        return u'x%02x' % (ord(unicode(character)) >> 8)

    def grouped_point(self, character):
        '''
        Return the position of the replacement character within the list
        for the group that character is a part of.
        '''
        return ord(unicode(character)) & 255
|
||||
|
@ -22,7 +22,8 @@ class LookAndFeelWidget(Widget, Ui_Form):
|
||||
'font_size_mapping', 'line_height',
|
||||
'linearize_tables',
|
||||
'disable_font_rescaling', 'insert_blank_line',
|
||||
'remove_paragraph_spacing', 'input_encoding']
|
||||
'remove_paragraph_spacing', 'input_encoding',
|
||||
'asciiize']
|
||||
)
|
||||
self.db, self.book_id = db, book_id
|
||||
self.initialize_options(get_option, get_help, db, book_id)
|
||||
|
@ -89,6 +89,13 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="9" column="0" colspan="3">
|
||||
<widget class="QCheckBox" name="opt_asciiize">
|
||||
<property name="text">
|
||||
<string>&Transliterate unicode characters to ASCII.</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="2" column="0">
|
||||
<widget class="QLabel" name="label_2">
|
||||
<property name="text">
|
||||
|
Loading…
x
Reference in New Issue
Block a user