Implement #2846 (Convert common unicode punctuation to ascii.)

2025-07-07 10:14:46 -04:00 · 2009-07-25 11:29:09 -06:00 · 2009-07-25 11:29:09 -06:00 · ee1c2e4cad
commit ee1c2e4cad
parent 791ec3af5b 52cb54a4b5
9 changed files with 3391 additions and 3 deletions
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@ -1,4 +1,4 @@
-from __future__ import with_statement
+# -*- coding: utf-8 -*-
 '''
 Defines the plugin system for conversions.
 '''
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@ -125,6 +125,7 @@ def add_pipeline_options(parser, plumber):
                      'margin_top', 'margin_left', 'margin_right',
                      'margin_bottom', 'dont_justify',
                      'insert_blank_line', 'remove_paragraph_spacing',
+                      'asciiize',
                  ]
                  ),

--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -1,4 +1,4 @@
-from __future__ import with_statement
+# -*- coding: utf-8 -*-
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
@ -348,6 +348,20 @@ OptionRecommendation(name='read_metadata_from_opf',
                   'file.')
        ),

+OptionRecommendation(name='asciiize',
+        recommended_value=False, level=OptionRecommendation.LOW,
+        help=(_('Transliterate unicode characters to an ASCII '
+            'representation. Use with care because this will remove replace '
+            'unicode characters with ASCII. For instance it will replace "%s" '
+            'with "Mikhail Gorbachiov". Also, note that in '
+            'cases where there are multiple representations of a character '
+            '(characters shared by Chinese and Japanese for instance) the '
+            'representation used by the largest number of people will be '
+            'used (Chinese in the previous example).')%\
+            u'\u041c\u0438\u0445\u0430\u0438\u043b '
+            u'\u0413\u043e\u0440\u0431\u0430\u0447\u0451\u0432'
+)
+        ),

 OptionRecommendation(name='title',
    recommended_value=None, level=OptionRecommendation.LOW,
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -221,6 +221,11 @@ class HTMLPreProcessor(object):

        html = XMLDECL_RE.sub('', html)

+        if getattr(self.extra_opts, 'asciiize', False):
+            from calibre.ebooks.unidecode.unidecoder import Unidecoder
+            unidecoder = Unidecoder()
+            html = unidecoder.decode(html)
+
        if self.plugin_preprocess:
            html = self.input_plugin_preprocess(html)

--- a/src/calibre/ebooks/unidecode/init.py
+++ b/src/calibre/ebooks/unidecode/init.py
--- a/src/calibre/ebooks/unidecode/unicodepoints.py
+++ b/src/calibre/ebooks/unidecode/unicodepoints.py
--- a/src/calibre/ebooks/unidecode/unidecoder.py
+++ b/src/calibre/ebooks/unidecode/unidecoder.py
@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Decode unicode text to an ASCII representation of the text. Transliterate
+unicode characters to ASCII.
+
+Based on the ruby unidecode gem (http://rubyforge.org/projects/unidecode/) which
+is based on the perl module Text::Unidecode
+(http://search.cpan.org/~sburke/Text-Unidecode-0.04/). More information about
+unidecode can be found at
+http://interglacial.com/~sburke/tpj/as_html/tpj22.html.
+
+The major differences between this implementation and others are that it is
+written in python and that it uses a single dictionary instead of loading the
+code group files as needed.
+
+
+Copyright (c) 2007 Russell Norris
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+
+Copyright 2001, Sean M. Burke <sburke@cpan.org>, all rights reserved.
+
+The programs and documentation in this dist are distributed in the
+hope that they will be useful, but without any warranty; without even
+the implied warranty of merchantability or fitness for a particular
+purpose.
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself.
+'''
+
+import re
+
+from calibre.ebooks.unidecode.unicodepoints import CODEPOINTS
+
+class Unidecoder(object):
+
+    def decode(self, text):
+        '''
+        Transliterate the string from unicode characters to ASCII.
+        '''
+        # The keys for CODEPOINTS are unicode characters, so we want to be sure
+        # the input text is unicode.
+        if not isinstance(text, unicode):
+            try:
+                text = unicode(text)
+            except:
+                text = text.decode('utf-8', 'ignore')
+        # Replace characters larger than 127 with their ASCII equivalent.
+        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),
+            text)
+
+    def replace_point(self, codepoint):
+        '''
+        Returns the replacement character or ? if none can be found.
+        '''
+        try:
+            # Split the unicode character xABCD into parts 0xAB and 0xCD.
+            # 0xAB represents the group within CODEPOINTS to query and 0xCD
+            # represents the position in the list of characters for the group.
+            return CODEPOINTS[self.code_group(codepoint)][self.grouped_point(
+                codepoint)]
+        except:
+            return '?'
+
+    def code_group(self, character):
+        '''
+        Find what group character is a part of.
+        '''
+        # Code groups within CODEPOINTS take the form 'xAB'
+        return u'x%02x' % (ord(unicode(character)) >> 8)
+
+    def grouped_point(self, character):
+        '''
+        Return the position of the replacement character in the list for
+        the group that character is a part of.
+        '''
+        return ord(unicode(character)) & 255
+
--- a/src/calibre/gui2/convert/look_and_feel.py
+++ b/src/calibre/gui2/convert/look_and_feel.py
@ -22,7 +22,8 @@ class LookAndFeelWidget(Widget, Ui_Form):
                    'font_size_mapping', 'line_height',
                    'linearize_tables',
                    'disable_font_rescaling', 'insert_blank_line',
-                    'remove_paragraph_spacing', 'input_encoding']
+                    'remove_paragraph_spacing', 'input_encoding',
+                    'asciiize']
                )
        self.db, self.book_id = db, book_id
        self.initialize_options(get_option, get_help, db, book_id)
--- a/src/calibre/gui2/convert/look_and_feel.ui
+++ b/src/calibre/gui2/convert/look_and_feel.ui
@ -89,6 +89,13 @@
       </property>
      </widget>
     </item>
+     <item row="9" column="0" colspan="3">
+      <widget class="QCheckBox" name="opt_asciiize">
+       <property name="text">
+        <string>&amp;Transliterate unicode characters to ASCII.</string>
+       </property>
+      </widget>
+     </item>
     <item row="2" column="0">
      <widget class="QLabel" name="label_2">
       <property name="text">