improve filename/path conversion using unihandecode

Hiroshi Miura 2010-12-25 23:55:07 +09:00
parent 44ca3ea808
commit 0a22d4af1a
12 changed files with 21330 additions and 1560 deletions


@@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'
 import functools, re
 from calibre import entity_to_unicode
+from calibre.utils.config import prefs
 XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 SVG_NS = 'http://www.w3.org/2000/svg'
@@ -523,9 +524,9 @@ class HTMLPreProcessor(object):
         html = XMLDECL_RE.sub('', html)
         if getattr(self.extra_opts, 'asciiize', False):
-            from calibre.ebooks.unidecode.unidecoder import Unidecoder
-            unidecoder = Unidecoder()
-            html = unidecoder.decode(html)
+            from calibre.ebooks.unihandecode import Unihandecoder
+            unihandecoder = Unihandecoder(lang=prefs['language'])
+            html = unihandecoder.decode(html)
         if self.plugin_preprocess:
             html = self.input_plugin_preprocess(self.extra_opts, html)
@@ -535,10 +536,10 @@ class HTMLPreProcessor(object):
         unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
         if unsupported_unicode_chars:
-            from calibre.ebooks.unidecode.unidecoder import Unidecoder
-            unidecoder = Unidecoder()
+            from calibre.ebooks.unihandecode import Unihandecoder
+            unihandecoder = Unihandecoder(lang=prefs['language'])
             for char in unsupported_unicode_chars:
-                asciichar = unidecoder.decode(char)
+                asciichar = unihandecoder.decode(char)
                 html = html.replace(char, asciichar)
         return html
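For context, a minimal usage sketch of the new asciiize path (Python 2, matching calibre of this era; Unihandecoder and prefs['language'] are taken from the hunk above, while the sample HTML string is purely illustrative):

from calibre.ebooks.unihandecode import Unihandecoder
from calibre.utils.config import prefs

html = u'<p>\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f</p>'
# The decoder is built with the user's interface language, then every
# non-ASCII character in the HTML is transliterated to ASCII.
decoder = Unihandecoder(lang=prefs['language'])
html = decoder.decode(html)
print html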


@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
__all__ = ["Unihandecoder"]

'''
Decode unicode text to an ASCII representation of the text.
Translate unicode characters to ASCII.

Inspired by John Schember's unidecode library.
Copyright(c) 2009, John Schember

Transliterates strings from unicode characters to ASCII for Chinese and
other languages.
'''

from unidecoder import Unidecoder
from jadecoder import Jadecoder
from krdecoder import Krdecoder


class Unihandecoder(object):
    preferred_encoding = None
    lang = None

    def __init__(self, lang="zh", encoding='utf-8'):
        self.preferred_encoding = encoding
        self.lang = lang

    def decode(self, text):
        '''
        Example conversions: "明天明天的风吹", "明日は明日の風が吹く"
        and "내일은 내일 바람이 분다"

        >>> d = Unihandecoder(lang="zh")
        >>> print d.decode(u"\u660e\u5929\u660e\u5929\u7684\u98ce\u5439")
        Ming Tian Ming Tian De Feng Chui
        >>> d = Unihandecoder(lang="ja")
        >>> print d.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
        Ashita ha Ashita no Kaze ga Fuku
        >>> d = Unihandecoder(lang="kr")
        >>> print d.decode(u'\ub0b4\uc77c\uc740 \ub0b4\uc77c \ubc14\ub78c\uc774 \ubd84\ub2e4')
        naeileun naeil barami bunda
        '''
        # The codepoint tables are keyed by unicode characters, so coerce
        # byte-string input to unicode before decoding.
        if not isinstance(text, unicode):
            try:
                text = unicode(text)
            except:
                try:
                    text = text.decode(self.preferred_encoding)
                except:
                    text = text.decode('utf-8', 'replace')

        if self.lang == "ja":
            d = Jadecoder()
            return d.decode(text)
        elif self.lang == "kr":
            d = Krdecoder()
            return d.decode(text)
        else:
            d = Unidecoder()
            return d.decode(text)


def _test():
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()
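A brief usage sketch of the dispatch above (outputs taken from the doctests in this file; any lang value other than "ja" or "kr" falls through to the Chinese-reading Unidecoder, and the Japanese result additionally requires libkakasi, see jadecoder below):

from calibre.ebooks.unihandecode import Unihandecoder

d = Unihandecoder(lang="ja")   # Japanese: dispatches to the kakasi-backed Jadecoder
print d.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
# Ashita ha Ashita no Kaze ga Fuku

d = Unihandecoder(lang="de")   # any unlisted language code: falls back to Unidecoder
print d.decode(u"\u660e\u5929\u660e\u5929\u7684\u98ce\u5439")
# Ming Tian Ming Tian De Feng Chui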

File diff suppressed because it is too large


@@ -0,0 +1,84 @@
# coding: utf8
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'

'''
Decode unicode text to an ASCII representation of the text for Japanese.
Translate a unicode string to an ASCII roman string.

The API is based on the python unidecode, which is based on a Ruby gem
(http://rubyforge.org/projects/unidecode/) and the perl module Text::Unidecode
(http://search.cpan.org/~sburke/Text-Unidecode-0.04/).

The transliteration itself is provided by the KAKASI Japanese processing engine.

Copyright (c) 2010 Hiroshi Miura
'''

import os, re
from ctypes import CDLL, c_char_p

from unidecoder import Unidecoder
from unicodepoints import CODEPOINTS
from jacodepoints import CODEPOINTS as JACODES


class Jadecoder(Unidecoder):

    # kakasi instance
    kakasi = None
    codepoints = {}

    def __init__(self):
        self.codepoints = CODEPOINTS
        self.codepoints.update(JACODES)
        # Load the shared kakasi library if it is installed; otherwise the
        # decoder falls back to the per-character codepoint tables.
        try:
            if os.name == "nt":
                self.kakasi = CDLL("libkakasi")
            elif os.name == "posix":
                self.kakasi = CDLL("libkakasi.so")
            else:
                self.kakasi = None
        except:
            self.kakasi = None

    def decode(self, text):
        '''
        Translate the string from unicode characters to ASCII in Japanese.

        Example conversions: "明日は明日の風が吹く" and "明天明天的风吹"

        >>> k = Jadecoder()
        >>> print k.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
        Ashita ha Ashita no Kaze ga Fuku
        >>> print k.decode(u'\u660e\u5929\u660e\u5929\u7684\u98ce\u5439')
        MeiTenMeiTenTekiSui
        '''
        # If the kakasi library is not available, fall back to the
        # unidecode-style codepoint lookup.
        if self.kakasi is None:
            return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)

        # kakasi options: transliterate kanji, hiragana, katakana and symbols
        # to ascii, capitalize words (-C), insert separators (-s), and read
        # the input as EUC-JP (-ieuc).
        numopt = 9
        argArray = c_char_p * numopt
        args = argArray(c_char_p("kakasi"),
                c_char_p("-Ja"), c_char_p("-Ha"), c_char_p("-Ka"), c_char_p("-Ea"),
                c_char_p("-ka"), c_char_p("-C"), c_char_p("-s"),
                c_char_p("-ieuc"))
        self.kakasi.kakasi_getopt_argv(numopt, args)
        kakasi_do = self.kakasi.kakasi_do
        kakasi_do.restype = c_char_p
        try:
            cstr = c_char_p(text.encode("eucjp"))
            return kakasi_do(cstr).decode("eucjp")
        except:
            return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)


def _test():
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()
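A short usage sketch of the decoder above (the output line comes from the file's own doctest; the module path assumes jadecoder.py sits in the unihandecode package alongside __init__.py, and libkakasi must be loadable for the kakasi-backed result, otherwise the codepoint-table fallback is used):

from calibre.ebooks.unihandecode.jadecoder import Jadecoder

k = Jadecoder()
# With libkakasi loaded, kakasi performs the kana/kanji to romaji conversion:
print k.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
# Ashita ha Ashita no Kaze ga Fuku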

File diff suppressed because it is too large


@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'

'''
Decode unicode text to an ASCII representation of the text in Korean.
Based on unidecoder.
'''

import re

from unidecoder import Unidecoder
from krcodepoints import CODEPOINTS as HANCODES
from unicodepoints import CODEPOINTS


class Krdecoder(Unidecoder):

    codepoints = {}

    def __init__(self):
        self.codepoints = CODEPOINTS
        self.codepoints.update(HANCODES)

    def decode(self, text):
        '''
        Example conversions:

        >>> h = Krdecoder()
        >>> print h.decode(u"내일은 내일 바람이 분다")
        naeileun naeil barami bunda
        >>> print h.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
        MyengIlhaMyengIlnoPhwunggaChwiku
        '''
        # Replace characters larger than 127 with their ASCII equivalent.
        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),
                text)

    def replace_point(self, codepoint):
        '''
        Returns the replacement character, or '?' if none can be found.
        '''
        try:
            # Split the unicode character xABCD into parts 0xAB and 0xCD.
            # 0xAB represents the group within CODEPOINTS to query and 0xCD
            # represents the position in the list of characters for the group.
            return self.codepoints[self.code_group(codepoint)][self.grouped_point(
                codepoint)]
        except:
            return '?'


def _test():
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()


@@ -1,12 +1,16 @@
 # -*- coding: utf-8 -*-
 __license__ = 'GPL 3'
-__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
 __docformat__ = 'restructuredtext en'
 '''
-Decode unicode text to an ASCII representation of the text. Transliterate
-unicode characters to ASCII.
+Decode unicode text to an ASCII representation of the text in Chinese.
+Transliterate unicode characters to ASCII based on Chinese pronunciation.
+Derived from John Schember's unidecode library.
+Copyright(c) 2009, John Schember
 Based on the ruby unidecode gem (http://rubyforge.org/projects/unidecode/) which
 is based on the perl module Text::Unidecode
@@ -55,29 +59,29 @@ it under the same terms as Perl itself.
 '''
 import re
-from calibre.ebooks.unidecode.unicodepoints import CODEPOINTS
-from calibre.constants import preferred_encoding
+from unicodepoints import CODEPOINTS
+from zhcodepoints import CODEPOINTS as HANCODES
 class Unidecoder(object):
+    codepoints = {}
+
+    def __init__(self):
+        self.codepoints = CODEPOINTS
+        self.codepoints.update(HANCODES)
     def decode(self, text):
         '''
-        Tranliterate the string from unicode characters to ASCII.
+        Transliterate the string from unicode characters to ASCII in Chinese and others.
+        Example conversions: "明天明天的风吹" and "明日は明日の風が吹く"
+
+        >>> u = Unidecoder()
+        >>> print u.decode(u"\u660e\u5929\u660e\u5929\u7684\u98ce\u5439")
+        Ming Tian Ming Tian De Feng Chui
+        >>> print u.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
+        Ming Ri haMing Ri noFeng gaChui ku
         '''
-        # The keys for CODEPOINTS is unicode characters, we want to be sure the
-        # input text is unicode.
-        if not isinstance(text, unicode):
-            try:
-                text = unicode(text)
-            except:
-                try:
-                    text = text.decode(preferred_encoding)
-                except:
-                    text = text.decode('utf-8', 'replace')
         # Replace characters larger than 127 with their ASCII equivalent.
-        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),
-            text)
+        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)
     def replace_point(self, codepoint):
         '''
@@ -87,10 +91,10 @@ class Unidecoder(object):
             # Split the unicode character xABCD into parts 0xAB and 0xCD.
             # 0xAB represents the group within CODEPOINTS to query and 0xCD
             # represents the position in the list of characters for the group.
-            return CODEPOINTS[self.code_group(codepoint)][self.grouped_point(
+            return self.codepoints[self.code_group(codepoint)][self.grouped_point(
                 codepoint)]
         except:
-            return '?'
+            return ''
     def code_group(self, character):
         '''
@@ -106,3 +110,10 @@ class Unidecoder(object):
         '''
         return ord(unicode(character)) & 255
+
+def _test():
+    import doctest
+    doctest.testmod()
+
+if __name__ == "__main__":
+    _test()
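The comments above describe the table lookup; here is a small worked example for u'明' (U+660E), assuming code_group returns the 'xAB'-style string key used by the original unidecoder tables:

ch = u'\u660e'                     # u'明'
group = 'x%02x' % (ord(ch) >> 8)   # 'x66' -> which CODEPOINTS group to query
pos = ord(ch) & 255                # 0x0e = 14 -> index within that group
# replace_point() then returns self.codepoints['x66'][14], e.g. u'Ming '
# (matching the "Ming Tian ..." doctest output above); on a lookup failure
# it now returns '' instead of '?'.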

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -6,7 +6,6 @@ meaning as possible.
 import os
 from math import ceil
-from calibre.ebooks.unidecode.unidecoder import Unidecoder
 from calibre import sanitize_file_name
 from calibre.constants import preferred_encoding, iswindows
 udc = Unidecoder()