mirror of https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
improve filename/path conversion from unihandecode
This commit is contained in:
parent 44ca3ea808
commit 0a22d4af1a
@@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'
 import functools, re
 
 from calibre import entity_to_unicode
+from calibre.utils.config import prefs
 
 XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 SVG_NS = 'http://www.w3.org/2000/svg'
@@ -523,9 +524,9 @@ class HTMLPreProcessor(object):
         html = XMLDECL_RE.sub('', html)
 
         if getattr(self.extra_opts, 'asciiize', False):
-            from calibre.ebooks.unidecode.unidecoder import Unidecoder
-            unidecoder = Unidecoder()
-            html = unidecoder.decode(html)
+            from calibre.ebooks.unihandecode import Unihandecoder
+            unihandecoder = Unihandecoder(lang=prefs['language'])
+            html = unihandecoder.decode(html)
 
         if self.plugin_preprocess:
             html = self.input_plugin_preprocess(self.extra_opts, html)
@@ -535,10 +536,10 @@ class HTMLPreProcessor(object):
 
         unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
         if unsupported_unicode_chars:
-            from calibre.ebooks.unidecode.unidecoder import Unidecoder
-            unidecoder = Unidecoder()
+            from calibre.ebooks.unihandecode import Unihandecoder
+            unihandecoder = Unihandecoder(lang=prefs['language'])
             for char in unsupported_unicode_chars:
-                asciichar = unidecoder.decode(char)
+                asciichar = unihandecoder.decode(char)
                 html = html.replace(char, asciichar)
 
         return html
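For reference, a minimal sketch of the new asciiize path in isolation; the prefs['language'] lookup and the Unihandecoder call are taken from the hunks above, while the standalone asciiize_html helper is only illustrative and not part of the commit:

    from calibre.utils.config import prefs
    from calibre.ebooks.unihandecode import Unihandecoder

    def asciiize_html(html):
        # Transliterate every non-ASCII character to a romanized ASCII form,
        # using the decoder that matches the configured interface language.
        unihandecoder = Unihandecoder(lang=prefs['language'])
        return unihandecoder.decode(html)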
72  src/calibre/ebooks/unihandecode/__init__.py  Normal file
@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-

__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
__all__ = ["Unihandecoder"]

'''
Decode unicode text to an ASCII representation of the text.
Translate unicode characters to ASCII.

Inspired by John's unidecode library.
Copyright(c) 2009, John Schember

Transliterate the string from unicode characters to ASCII, in Chinese and others.
'''

from unidecoder import Unidecoder
from jadecoder import Jadecoder
from krdecoder import Krdecoder

class Unihandecoder(object):
    preferred_encoding = None
    lang = None

    def __init__(self, lang="zh", encoding='utf-8'):
        self.preferred_encoding = encoding
        self.lang = lang

    def decode(self, text):
        '''
        Example conversions: "明天明天的风吹", "明日は明日の風が吹く"
        and "내일은 내일 바람이 분다"
        >>> d = Unihandecoder(lang="zh")
        >>> print d.decode(u"\u660e\u5929\u660e\u5929\u7684\u98ce\u5439")
        Ming Tian Ming Tian De Feng Chui
        >>> d = Unihandecoder(lang="ja")
        >>> print d.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
        Ashita ha Ashita no Kaze ga Fuku
        >>> d = Unihandecoder(lang="kr")
        >>> print d.decode(u'\ub0b4\uc77c\uc740 \ub0b4\uc77c \ubc14\ub78c\uc774 \ubd84\ub2e4')
        naeileun naeil barami bunda
        '''
        # The codepoint tables are keyed on unicode characters, so make sure
        # the input text is unicode before dispatching.
        if not isinstance(text, unicode):
            try:
                text = unicode(text)
            except:
                try:
                    text = text.decode(self.preferred_encoding)
                except:
                    text = text.decode('utf-8', 'replace')

        if self.lang == "ja":
            d = Jadecoder()
            return d.decode(text)
        elif self.lang == "kr":
            d = Krdecoder()
            return d.decode(text)
        else:
            d = Unidecoder()
            return d.decode(text)

def _test():
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()
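A hedged usage sketch of the dispatcher, mirroring the doctests above; nothing beyond the code in this file is assumed. The lang argument selects Jadecoder, Krdecoder or the default Chinese-reading Unidecoder, and decode() always returns ASCII text:

    from calibre.ebooks.unihandecode import Unihandecoder

    samples = [('zh', u'\u660e\u5929\u660e\u5929\u7684\u98ce\u5439'),
               ('ja', u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')]
    for lang, text in samples:
        # Each language gets its own decoder; the result is plain ASCII text.
        print Unihandecoder(lang=lang).decode(text)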
5251  src/calibre/ebooks/unihandecode/jacodepoints.py  Normal file
File diff suppressed because it is too large
84  src/calibre/ebooks/unihandecode/jadecoder.py  Normal file
@@ -0,0 +1,84 @@
# coding:utf8
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'

'''
Decode unicode text to an ASCII representation of the text for Japanese.
Translate a unicode string to an ASCII roman string.

The API is based on the python unidecode module,
which is based on the Ruby gem (http://rubyforge.org/projects/unidecode/)
and the perl module Text::Unidecode
(http://search.cpan.org/~sburke/Text-Unidecode-0.04/).

The Japanese romanization is provided by the Kakasi processing engine.

Copyright (c) 2010 Hiroshi Miura
'''

from ctypes import *
import os, re
from unidecoder import Unidecoder
from unicodepoints import CODEPOINTS
from jacodepoints import CODEPOINTS as JACODES

class Jadecoder(Unidecoder):

    # kakasi instance
    kakasi = None

    codepoints = {}

    def __init__(self):
        self.codepoints = CODEPOINTS
        self.codepoints.update(JACODES)

        try:
            if os.name == "nt":
                self.kakasi = CDLL("libkakasi")
            elif os.name == "posix":
                self.kakasi = CDLL("libkakasi.so")
            else:
                self.kakasi = None
        except:
            self.kakasi = None

    def decode(self, text):
        '''
        Translate the string from unicode characters to ASCII in Japanese.
        Example conversions: "明日は明日の風が吹く" and "明天明天的风吹"
        >>> k = Jadecoder()
        >>> print k.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
        Ashita ha Ashita no Kaze ga Fuku
        >>> print k.decode(u'\u660e\u5929\u660e\u5929\u7684\u98ce\u5439')
        MeiTenMeiTenTekiSui
        '''

        # If the kakasi library is not available, fall back to the codepoint
        # tables inherited from Unidecoder.
        if self.kakasi is None:
            return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)

        numopt = 9
        argArray = c_char_p * numopt
        args = argArray(c_char_p("kakasi"),
                c_char_p("-Ja"), c_char_p("-Ha"), c_char_p("-Ka"), c_char_p("-Ea"),
                c_char_p("-ka"), c_char_p("-C"), c_char_p("-s"),
                c_char_p("-ieuc"))
        self.kakasi.kakasi_getopt_argv(numopt, args)
        kakasi_do = self.kakasi.kakasi_do
        kakasi_do.restype = c_char_p

        try:
            cstr = c_char_p(text.encode("eucjp"))
            return kakasi_do(cstr).decode("eucjp")
        except:
            return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)

def _test():
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()
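A possible refinement, noted here only as an assumption and not part of this commit: ctypes.util.find_library can resolve the platform-specific name of the kakasi shared library instead of hard-coding "libkakasi" and "libkakasi.so" per os.name:

    from ctypes import CDLL
    from ctypes.util import find_library

    # find_library('kakasi') returns something like 'libkakasi.so.2' on Linux,
    # or None when the library is absent, in which case the codepoint-table
    # fallback above still applies.
    libname = find_library('kakasi')
    kakasi = CDLL(libname) if libname else None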
5251  src/calibre/ebooks/unihandecode/krcodepoints.py  Normal file
File diff suppressed because it is too large
58  src/calibre/ebooks/unihandecode/krdecoder.py  Normal file
@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-

__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'

'''
Decode unicode text to an ASCII representation of the text in Korean.
Based on unidecoder.
'''

import re
from unidecoder import Unidecoder
from krcodepoints import CODEPOINTS as HANCODES
from unicodepoints import CODEPOINTS

class Krdecoder(Unidecoder):

    codepoints = {}

    def __init__(self):
        self.codepoints = CODEPOINTS
        self.codepoints.update(HANCODES)

    def decode(self, text):
        '''
        Example conversions:
        >>> h = Krdecoder()
        >>> print h.decode(u"내일은 내일 바람이 분다")
        naeileun naeil barami bunda
        >>> print h.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
        MyengIlhaMyengIlnoPhwunggaChwiku
        '''
        # Replace characters larger than 127 with their ASCII equivalent.
        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),
            text)

    def replace_point(self, codepoint):
        '''
        Returns the replacement character or ? if none can be found.
        '''
        try:
            # Split the unicode character xABCD into parts 0xAB and 0xCD.
            # 0xAB represents the group within CODEPOINTS to query and 0xCD
            # represents the position in the list of characters for the group.
            return self.codepoints[self.code_group(codepoint)][self.grouped_point(
                codepoint)]
        except:
            return '?'

def _test():
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()
File diff suppressed because it is too large
@@ -1,12 +1,16 @@
 # -*- coding: utf-8 -*-
 
 __license__ = 'GPL 3'
-__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
 __docformat__ = 'restructuredtext en'
 
 '''
-Decode unicode text to an ASCII representation of the text. Transliterate
-unicode characters to ASCII.
+Decode unicode text to an ASCII representation of the text in Chinese.
+Transliterate unicode characters to ASCII based on Chinese pronunciation.
+
+Derived from John's unidecode library.
+
+Copyright(c) 2009, John Schember
 
 Based on the ruby unidecode gem (http://rubyforge.org/projects/unidecode/) which
 is based on the perl module Text::Unidecode
@@ -55,29 +59,29 @@ it under the same terms as Perl itself.
 '''
 
 import re
 
-from calibre.ebooks.unidecode.unicodepoints import CODEPOINTS
-from calibre.constants import preferred_encoding
+from unicodepoints import CODEPOINTS
+from zhcodepoints import CODEPOINTS as HANCODES
 
 class Unidecoder(object):
 
+    codepoints = {}
+
+    def __init__(self):
+        self.codepoints = CODEPOINTS
+        self.codepoints.update(HANCODES)
+
     def decode(self, text):
         '''
-        Tranliterate the string from unicode characters to ASCII.
+        Transliterate the string from unicode characters to ASCII in Chinese and others.
+        Example conversions: "明天明天的风吹" and "明日は明日の風が吹く"
+        >>> u = Unidecoder()
+        >>> print u.decode(u"\u660e\u5929\u660e\u5929\u7684\u98ce\u5439")
+        Ming Tian Ming Tian De Feng Chui
+        >>> print u.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
+        Ming Ri haMing Ri noFeng gaChui ku
         '''
-        # The keys for CODEPOINTS is unicode characters, we want to be sure the
-        # input text is unicode.
-        if not isinstance(text, unicode):
-            try:
-                text = unicode(text)
-            except:
-                try:
-                    text = text.decode(preferred_encoding)
-                except:
-                    text = text.decode('utf-8', 'replace')
-        # Replace characters larger than 127 with their ASCII equivelent.
-        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),
-            text)
+        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)
 
     def replace_point(self, codepoint):
         '''
@@ -87,10 +91,10 @@ class Unidecoder(object):
             # Split the unicode character xABCD into parts 0xAB and 0xCD.
             # 0xAB represents the group within CODEPOINTS to query and 0xCD
             # represents the position in the list of characters for the group.
-            return CODEPOINTS[self.code_group(codepoint)][self.grouped_point(
+            return self.codepoints[self.code_group(codepoint)][self.grouped_point(
                 codepoint)]
         except:
-            return '?'
+            return ''
 
     def code_group(self, character):
         '''
@@ -106,3 +110,10 @@ class Unidecoder(object):
         '''
         return ord(unicode(character)) & 255
 
+
+def _test():
+    import doctest
+    doctest.testmod()
+
+if __name__ == "__main__":
+    _test()
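To make the table lookup in replace_point() concrete, here is a small worked example of the split described in its comments; the exact key format produced by code_group() is defined elsewhere in this file and is not repeated here:

    # For U+660E (明), the codepoint 0x660e is split into two bytes:
    ch = u'\u660e'
    group = ord(ch) >> 8    # 0x66 -> selects the group within the codepoint table
    offset = ord(ch) & 255  # 0x0e -> grouped_point(), the index inside that group
    # replace_point() then returns self.codepoints[self.code_group(ch)][offset],
    # which the Chinese tables romanize as 'Ming' (see the doctest above).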
5251  src/calibre/ebooks/unihandecode/vncodepoints.py  Normal file
File diff suppressed because it is too large
5251  src/calibre/ebooks/unihandecode/zhcodepoints.py  Normal file
File diff suppressed because it is too large
@@ -6,7 +6,6 @@ meaning as possible.
import os
from math import ceil

from calibre.ebooks.unidecode.unidecoder import Unidecoder
from calibre import sanitize_file_name
from calibre.constants import preferred_encoding, iswindows
udc = Unidecoder()