When converting non-English text to English, use the user's current calibre interface language. This allows Japanese/Korean/Vietnamese characters to be correctly converted. Previously they were assumed to be Chinese. Fixes #7622 (calibre needs to switch logic when converting Unicode filenames into ASCII)

This commit is contained in:
Kovid Goyal 2011-02-14 10:50:28 -07:00
commit c4f06e39af
27 changed files with 146309 additions and 3292 deletions

View File

@ -193,6 +193,33 @@ License: GPL-3
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL-3 on Debian systems.
Files: src/calibre/ebooks/unihandecode/pykakasi/*
Copyright: 2011, Hiroshi Miura <miurahr@linux.com>
Copyright: 1992, Hironobu Takahashi
License: GPL-2+
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL on Debian systems.
Files: resources/kanwadict2.db
Files: resources/itaijidict2.pickle
Copyright: 2011, Hiroshi Miura <miurahr@linux.com>
Copyright: 1992 1993 1994, Hironobu Takahashi (takahasi@tiny.or.jp),
Copyright: 1992 1993 1994, Masahiko Sato (masahiko@sato.riec.tohoku.ac.jp),
Copyright: 1992 1993 1994, Yukiyoshi Kameyama, Miki Inooka, Akihiko Sasaki, Dai Ando, Junichi Okukawa,
Copyright: 1992 1993 1994, Katsushi Sato and Nobuhiro Yamagishi
License: GPL-2+
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL on Debian systems.
Files: src/calibre/ebooks/unihandecode/*
Copyright: 2010-2011, Hiroshi Miura <miurahr@linux.com>
Copyright: 2009, John Schember
Copyright: 2007, Russell Norris
Copyright: 2001, Sean M. Burke
License: GPL-3, Perl
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL-3 on Debian systems.
Files: src/encutils/__init__.py
Copyright: 2005-2008: Christof Hoeke
License: LGPL-3+, CC-BY-3.0

View File

@ -6,9 +6,10 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, cPickle
import os, cPickle, re, anydbm, shutil
from zlib import compress
from setup import Command, basenames
from setup import Command, basenames, __appname__
def get_opts_from_parser(parser):
def do_opt(opt):
@ -26,6 +27,9 @@ class Resources(Command):
description = 'Compile various needed calibre resources'
KAKASI_PATH = os.path.join(Command.SRC, __appname__,
'ebooks', 'unihandecode', 'pykakasi')
def run(self, opts):
scripts = {}
for x in ('console', 'gui'):
@ -101,11 +105,113 @@ class Resources(Command):
import json
json.dump(function_dict, open(dest, 'wb'), indent=4)
self.run_kakasi(opts)
def run_kakasi(self, opts):
    """Build the pykakasi dictionary resources (kanwa dbm database,
    itaiji and kana pickles) from their .utf8 source files, skipping
    any output that is already newer than its source."""
    # Accumulator filled by parsekdict() and flushed by kanwaout().
    self.records = {}
    src = self.j(self.KAKASI_PATH, 'kakasidict.utf8')
    dest = self.j(self.RESOURCES, 'localization',
            'pykakasi','kanwadict2.db')
    base = os.path.dirname(dest)
    if not os.path.exists(base):
        os.makedirs(base)
    if not self.newer(dest, src):
        self.info('\tKanwadict is up to date')
    else:
        self.info('\tGenerating Kanwadict')
        # One dictionary entry per line; parsekdict() splits each into
        # (reading, kanji) and files it under the first codepoint.
        for line in open(src, "r"):
            self.parsekdict(line)
        self.kanwaout(dest)
    src = self.j(self.KAKASI_PATH, 'itaijidict.utf8')
    dest = self.j(self.RESOURCES, 'localization',
            'pykakasi','itaijidict2.pickle')
    if not self.newer(dest, src):
        self.info('\tItaijidict is up to date')
    else:
        self.info('\tGenerating Itaijidict')
        self.mkitaiji(src, dest)
    src = self.j(self.KAKASI_PATH, 'kanadict.utf8')
    dest = self.j(self.RESOURCES, 'localization',
            'pykakasi','kanadict2.pickle')
    if not self.newer(dest, src):
        self.info('\tKanadict is up to date')
    else:
        self.info('\tGenerating kanadict')
        self.mkkanadict(src, dest)
    return
def mkitaiji(self, src, dst):
    """Build the itaiji (variant kanji) pickle from *src*.

    Each non-comment line holds a two character pair: the variant
    character followed by its canonical replacement. Literal \\uXXXX
    escapes in the source are expanded before the pair is split.
    """
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle
    try:
        _unichr = unichr  # Python 2
    except NameError:
        _unichr = chr
    dic = {}
    # Binary read + explicit decode works identically on py2 and py3.
    with open(src, 'rb') as f:
        for raw in f:
            line = raw.decode("utf-8").strip()
            if line.startswith(';;'):  # skip comment
                continue
            if not line:  # skip blank lines
                continue
            # Expand literal \uXXXX escapes into real characters.
            pair = re.sub(r'\\u([0-9a-fA-F]{4})',
                    lambda x: _unichr(int(x.group(1), 16)), line)
            # variant character -> canonical character
            dic[pair[0]] = pair[1]
    # 'wb' is required: protocol -1 selects a binary pickle format,
    # which text mode would corrupt on Windows.
    with open(dst, 'wb') as f:
        pickle.dump(dic, f, protocol=-1)
def mkkanadict(self, src, dst):
    """Build the kana pickle from *src*.

    Each non-comment line is ``romaji katakana``; the resulting dict
    maps the katakana string back to its romaji reading.
    """
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle
    dic = {}
    # Binary read + explicit decode works identically on py2 and py3.
    with open(src, 'rb') as f:
        for raw in f:
            line = raw.decode("utf-8").strip()
            if line.startswith(';;'):  # skip comment
                continue
            if not line:  # skip blank lines
                continue
            (alpha, kana) = line.split(' ')
            dic[kana] = alpha
    # 'wb' is required: protocol -1 selects a binary pickle format,
    # which text mode would corrupt on Windows.
    with open(dst, 'wb') as f:
        pickle.dump(dic, f, protocol=-1)
def parsekdict(self, line):
    """Parse one ``reading kanji`` line of the kanwa dictionary and
    record it via updaterec()."""
    entry = line.decode("utf-8").strip()
    if entry.startswith(';;'):  # comment line in the source dictionary
        return
    (reading, kanji) = entry.split(' ')
    tail = ''
    last = reading[-1:]
    # A trailing ASCII letter is not part of the reading: it encodes
    # the consonant class of the character that may follow (okurigana
    # hint), and is stored separately as the entry's tail.
    if ord(last) <= ord('z'):
        tail = last
        reading = reading[:-1]
    self.updaterec(kanji, reading, tail)
def updaterec(self, kanji, yomi, tail):
    """Append a (yomi, tail) reading for *kanji* to self.records,
    grouped under the four digit hex codepoint of its first
    character (the key later used for the dbm lookup)."""
    key = "%04x" % ord(kanji[0])
    group = self.records.setdefault(key, {})
    group.setdefault(kanji, []).append((yomi, tail))
def kanwaout(self, out):
    """Write the accumulated reading records to *out* as a dbm
    database; each value is a zlib-compressed pickle of the per-key
    dict of readings."""
    db = anydbm.open(out, 'c')
    for key in self.records:
        db[key] = compress(cPickle.dumps(self.records[key], -1))
    db.close()
def clean(self):
    """Remove all generated resource files, including the pykakasi
    dictionary directory."""
    for name in ('scripts', 'recipes', 'ebook-convert-complete'):
        path = self.j(self.RESOURCES, name + '.pickle')
        if os.path.exists(path):
            os.remove(path)
    kakasi_dir = self.j(self.RESOURCES, 'localization', 'pykakasi')
    if os.path.exists(kakasi_dir):
        shutil.rmtree(kakasi_dir)

View File

@ -402,8 +402,8 @@ OptionRecommendation(name='asciiize',
'with "Mikhail Gorbachiov". Also, note that in '
'cases where there are multiple representations of a character '
'(characters shared by Chinese and Japanese for instance) the '
'representation used by the largest number of people will be '
'used (Chinese in the previous example).')%\
'representation based on the current calibre interface language will be '
'used.')%\
u'\u041c\u0438\u0445\u0430\u0438\u043b '
u'\u0413\u043e\u0440\u0431\u0430\u0447\u0451\u0432'
)

View File

@ -543,9 +543,9 @@ class HTMLPreProcessor(object):
html = XMLDECL_RE.sub('', html)
if getattr(self.extra_opts, 'asciiize', False):
from calibre.ebooks.unidecode.unidecoder import Unidecoder
unidecoder = Unidecoder()
html = unidecoder.decode(html)
from calibre.utils.localization import get_udc
unihandecoder = get_udc()
html = unihandecoder.decode(html)
if getattr(self.extra_opts, 'enable_heuristics', False):
from calibre.ebooks.conversion.utils import HeuristicProcessor
@ -557,10 +557,10 @@ class HTMLPreProcessor(object):
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
if unsupported_unicode_chars:
from calibre.ebooks.unidecode.unidecoder import Unidecoder
unidecoder = Unidecoder()
from calibre.utils.localization import get_udc
unihandecoder = get_udc()
for char in unsupported_unicode_chars:
asciichar = unidecoder.decode(char)
asciichar = unihandecoder.decode(char)
html = html.replace(char, asciichar)
return html

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
__all__ = ["Unihandecoder"]
'''
Decode unicode text to an ASCII representation of the text.
Translate unicode characters to ASCII.
inspired from John's unidecode library.
Copyright(c) 2009, John Schember
Transliterate the string from unicode characters to ASCII in Chinese and other languages.
'''
import unicodedata
class Unihandecoder(object):
    '''Dispatch to a language specific transliterator based on the
    user's interface language, so that han characters shared between
    Chinese/Japanese/Korean/Vietnamese are romanized correctly.'''
    preferred_encoding = None
    decoder = None

    def __init__(self, lang="zh", encoding='utf-8'):
        self.preferred_encoding = encoding
        lang = lang.lower()
        # calibre's get_lang() returns ISO 639 codes ('ja', 'ko', 'vi');
        # the original country-code spellings ('kr', 'vn', 'vietnum')
        # are still accepted for backwards compatibility — without the
        # ISO codes Korean/Vietnamese fell through to the Chinese
        # decoder.
        if lang[:2] == u'ja':
            from calibre.ebooks.unihandecode.jadecoder import Jadecoder
            self.decoder = Jadecoder()
        elif lang[:2] in (u'ko', u'kr') or lang == u'korean':
            from calibre.ebooks.unihandecode.krdecoder import Krdecoder
            self.decoder = Krdecoder()
        elif lang[:2] in (u'vi', u'vn') or lang == u'vietnum':
            from calibre.ebooks.unihandecode.vndecoder import Vndecoder
            self.decoder = Vndecoder()
        else: # zh and everything else
            from calibre.ebooks.unihandecode.unidecoder import Unidecoder
            self.decoder = Unidecoder()

    def decode(self, text):
        '''Return an ASCII approximation of *text*.

        The input is coerced to unicode (preferred encoding first,
        then UTF-8 with replacement), NFKC normalized and handed to
        the language specific decoder.
        '''
        try:
            unicode  # only defined on Python 2
            if not isinstance(text, unicode):
                try:
                    text = unicode(text)
                except:
                    try:
                        text = text.decode(self.preferred_encoding)
                    except:
                        text = text.decode('utf-8', 'replace')
        except NameError:  # Python 3: str is already unicode
            pass
        # Normalize first so compatibility forms (full-width latin,
        # combining marks) map onto characters the decoder tables know.
        ntext = unicodedata.normalize('NFKC', text)
        return self.decoder.decode(ntext)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,41 @@
# coding:utf8
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
'''
Decode unicode text to an ASCII representation of the text for Japanese.
Translate unicode string to ASCII roman string.
API is based on the python unidecode,
which is based on Ruby gem (http://rubyforge.org/projects/unidecode/)
and perl module Text::Unidecode
(http://search.cpan.org/~sburke/Text-Unidecode-0.04/).
This functionality is owned by Kakasi Japanese processing engine.
Copyright (c) 2010 Hiroshi Miura
'''
import re
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
from calibre.ebooks.unihandecode.jacodepoints import CODEPOINTS as JACODES
from calibre.ebooks.unihandecode.pykakasi.kakasi import kakasi
class Jadecoder(Unidecoder):
    '''Japanese transliterator: romanizes kanji/kana with kakasi and
    falls back to the codepoint tables for any remaining non-ASCII
    characters.'''
    kakasi = None
    codepoints = {}

    def __init__(self):
        # Copy before updating: the original updated the imported
        # CODEPOINTS dict in place, leaking the Japanese overrides into
        # every other decoder that shares the module-level table.
        self.codepoints = dict(CODEPOINTS)
        self.codepoints.update(JACODES)
        self.kakasi = kakasi()

    def decode(self, text):
        '''Return an ASCII representation of *text*.'''
        try:
            result = self.kakasi.do(text)
        except Exception:
            # kakasi can fail (e.g. missing dictionary resources);
            # degrade to plain table transliteration of the input.
            result = text
        return re.sub(r'[^\x00-\x7f]',
                lambda x: self.replace_point(x.group()), result)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
'''
Decode unicode text to an ASCII representation of the text in Korean.
Based on unidecoder.
'''
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
from calibre.ebooks.unihandecode.krcodepoints import CODEPOINTS as HANCODES
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
class Krdecoder(Unidecoder):
    '''Korean transliterator: Unidecoder with the Hangul codepoint
    tables layered on top.'''
    codepoints = {}

    def __init__(self):
        # Copy before updating: the original updated the imported
        # CODEPOINTS dict in place, leaking the Hangul overrides into
        # every other decoder that shares the module-level table.
        self.codepoints = dict(CODEPOINTS)
        self.codepoints.update(HANCODES)

View File

@ -0,0 +1,5 @@
from calibre.ebooks.unihandecode.pykakasi.kakasi import kakasi
kakasi
__all__ = ["pykakasi"]

View File

@ -0,0 +1,185 @@
# -*- coding: utf-8 -*-
# h2a.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
class H2a (object):
    '''Longest-prefix conversion of hiragana (plus a few multi-char
    digraphs) to romaji. The table keys are 1-4 character hiragana
    sequences; values are their Hepburn-style romanizations.'''
    H2a_table = {
        u"\u3041":"a", u"\u3042":"a",
        u"\u3043":"i", u"\u3044":"i",
        u"\u3045":"u", u"\u3046":"u",
        u"\u3046\u309b":"vu", u"\u3046\u309b\u3041":"va",
        u"\u3046\u309b\u3043":"vi", u"\u3046\u309b\u3047":"ve",
        u"\u3046\u309b\u3049":"vo",
        u"\u3047":"e", u"\u3048":"e",
        u"\u3049":"o", u"\u304a":"o",
        u"\u304b":"ka", u"\u304c":"ga",
        # NOTE(review): the kya/kyu keys below use small a (\u3041) and
        # small u (\u3045) instead of small ya (\u3083) / yu (\u3085) —
        # confirm against upstream kakasi.
        u"\u304d":"ki", u"\u304d\u3041":"kya",
        u"\u304d\u3045":"kyu", u"\u304d\u3049":"kyo",
        # NOTE(review): "gya" is keyed on \u3050\u3083 (gu + small ya);
        # the expected key is \u304e\u3083 (gi + small ya). Likewise
        # "gyu" uses \u3045 (small u) instead of \u3085 (small yu).
        u"\u304e":"gi", u"\u3050\u3083":"gya",
        u"\u304e\u3045":"gyu", u"\u304e\u3087":"gyo",
        u"\u304f":"ku", u"\u3050":"gu",
        u"\u3051":"ke", u"\u3052":"ge",
        u"\u3053":"ko", u"\u3054":"go",
        u"\u3055":"sa", u"\u3056":"za",
        u"\u3057":"shi", u"\u3057\u3083":"sha",
        u"\u3057\u3085":"shu", u"\u3057\u3087":"sho",
        u"\u3058":"ji", u"\u3058\u3083":"ja",
        u"\u3058\u3085":"ju", u"\u3058\u3087":"jo",
        u"\u3059":"su", u"\u305a":"zu",
        u"\u305b":"se", u"\u305c":"ze",
        u"\u305d":"so", u"\u305e":"zo",
        u"\u305f":"ta", u"\u3060":"da",
        u"\u3061":"chi", u"\u3061\u3047":"che", u"\u3061\u3083":"cha",
        u"\u3061\u3085":"chu", u"\u3061\u3087":"cho",
        u"\u3062":"ji", u"\u3062\u3083":"ja",
        u"\u3062\u3085":"ju", u"\u3062\u3087":"jo",
        u"\u3063":"tsu",
        u"\u3063\u3046\u309b":"vvu",
        u"\u3063\u3046\u309b\u3041":"vva",
        u"\u3063\u3046\u309b\u3043":"vvi",
        u"\u3063\u3046\u309b\u3047":"vve",
        u"\u3063\u3046\u309b\u3049":"vvo",
        u"\u3063\u304b":"kka", u"\u3063\u304c":"gga",
        u"\u3063\u304d":"kki", u"\u3063\u304d\u3083":"kkya",
        u"\u3063\u304d\u3085":"kkyu", u"\u3063\u304d\u3087":"kkyo",
        u"\u3063\u304e":"ggi", u"\u3063\u304e\u3083":"ggya",
        u"\u3063\u304e\u3085":"ggyu", u"\u3063\u304e\u3087":"ggyo",
        u"\u3063\u304f":"kku", u"\u3063\u3050":"ggu",
        u"\u3063\u3051":"kke", u"\u3063\u3052":"gge",
        u"\u3063\u3053":"kko", u"\u3063\u3054":"ggo",
        u"\u3063\u3055":"ssa", u"\u3063\u3056":"zza",
        u"\u3063\u3057":"sshi", u"\u3063\u3057\u3083":"ssha",
        u"\u3063\u3057\u3085":"sshu", u"\u3063\u3057\u3087":"ssho",
        u"\u3063\u3058":"jji", u"\u3063\u3058\u3083":"jja",
        u"\u3063\u3058\u3085":"jju", u"\u3063\u3058\u3087":"jjo",
        u"\u3063\u3059":"ssu", u"\u3063\u305a":"zzu",
        # NOTE(review): \u3063\u305e is used twice (for "zze" here and
        # "zzo" on the next line); the dict keeps only "zzo". "zze" was
        # presumably meant to be keyed on \u3063\u305c — confirm.
        u"\u3063\u305b":"sse", u"\u3063\u305e":"zze",
        u"\u3063\u305d":"sso", u"\u3063\u305e":"zzo",
        u"\u3063\u305f":"tta", u"\u3063\u3060":"dda",
        u"\u3063\u3061":"tchi", u"\u3063\u3061\u3083":"tcha",
        u"\u3063\u3061\u3085":"tchu", u"\u3063\u3061\u3087":"tcho",
        u"\u3063\u3062":"jji", u"\u3063\u3062\u3083":"jjya",
        u"\u3063\u3062\u3085":"jjyu", u"\u3063\u3062\u3087":"jjyo",
        u"\u3063\u3064":"ttsu", u"\u3063\u3065":"zzu",
        u"\u3063\u3066":"tte", u"\u3063\u3067":"dde",
        u"\u3063\u3068":"tto", u"\u3063\u3069":"ddo",
        u"\u3063\u306f":"hha", u"\u3063\u3070":"bba",
        u"\u3063\u3071":"ppa",
        u"\u3063\u3072":"hhi", u"\u3063\u3072\u3083":"hhya",
        u"\u3063\u3072\u3085":"hhyu", u"\u3063\u3072\u3087":"hhyo",
        u"\u3063\u3073":"bbi", u"\u3063\u3073\u3083":"bbya",
        u"\u3063\u3073\u3085":"bbyu", u"\u3063\u3073\u3087":"bbyo",
        u"\u3063\u3074":"ppi", u"\u3063\u3074\u3083":"ppya",
        u"\u3063\u3074\u3085":"ppyu", u"\u3063\u3074\u3087":"ppyo",
        u"\u3063\u3075":"ffu", u"\u3063\u3075\u3041":"ffa",
        u"\u3063\u3075\u3043":"ffi", u"\u3063\u3075\u3047":"ffe",
        u"\u3063\u3075\u3049":"ffo",
        u"\u3063\u3076":"bbu", u"\u3063\u3077":"ppu",
        u"\u3063\u3078":"hhe", u"\u3063\u3079":"bbe",
        u"\u3063\u307a":"ppe",
        u"\u3063\u307b":"hho", u"\u3063\u307c":"bbo",
        u"\u3063\u307d":"ppo",
        u"\u3063\u3084":"yya", u"\u3063\u3086":"yyu",
        u"\u3063\u3088":"yyo",
        u"\u3063\u3089":"rra", u"\u3063\u308a":"rri",
        u"\u3063\u308a\u3083":"rrya", u"\u3063\u308a\u3085":"rryu",
        u"\u3063\u308a\u3087":"rryo",
        u"\u3063\u308b":"rru", u"\u3063\u308c":"rre",
        u"\u3063\u308d":"rro",
        u"\u3064":"tsu", u"\u3065":"zu",
        u"\u3066":"te", u"\u3067":"de", u"\u3067\u3043":"di",
        u"\u3068":"to", u"\u3069":"do",
        u"\u306a":"na",
        u"\u306b":"ni", u"\u306b\u3083":"nya",
        u"\u306b\u3085":"nyu", u"\u306b\u3087":"nyo",
        u"\u306c":"nu", u"\u306d":"ne", u"\u306e":"no",
        u"\u306f":"ha", u"\u3070":"ba", u"\u3071":"pa",
        u"\u3072":"hi", u"\u3072\u3083":"hya",
        u"\u3072\u3085":"hyu", u"\u3072\u3087":"hyo",
        u"\u3073":"bi", u"\u3073\u3083":"bya",
        u"\u3073\u3085":"byu", u"\u3073\u3087":"byo",
        u"\u3074":"pi", u"\u3074\u3083":"pya",
        u"\u3074\u3085":"pyu", u"\u3074\u3087":"pyo",
        u"\u3075":"fu", u"\u3075\u3041":"fa",
        u"\u3075\u3043":"fi", u"\u3075\u3047":"fe",
        u"\u3075\u3049":"fo",
        u"\u3076":"bu", u"\u3077":"pu",
        u"\u3078":"he", u"\u3079":"be", u"\u307a":"pe",
        u"\u307b":"ho", u"\u307c":"bo", u"\u307d":"po",
        u"\u307e":"ma",
        u"\u307f":"mi", u"\u307f\u3083":"mya",
        u"\u307f\u3085":"myu", u"\u307f\u3087":"myo",
        u"\u3080":"mu", u"\u3081":"me", u"\u3082":"mo",
        u"\u3083":"ya", u"\u3084":"ya",
        u"\u3085":"yu", u"\u3086":"yu",
        u"\u3087":"yo", u"\u3088":"yo",
        u"\u3089":"ra",
        u"\u308a":"ri", u"\u308a\u3083":"rya",
        u"\u308a\u3085":"ryu", u"\u308a\u3087":"ryo",
        u"\u308b":"ru", u"\u308c":"re", u"\u308d":"ro",
        u"\u308e":"wa", u"\u308f":"wa",
        u"\u3090":"i", u"\u3091":"e",
        u"\u3092":"wo", u"\u3093":"n",
        u"\u3093\u3042":"n'a", u"\u3093\u3044":"n'i",
        u"\u3093\u3046":"n'u", u"\u3093\u3048":"n'e",
        u"\u3093\u304a":"n'o",
    }

    # this class is Borg: all instances share one __dict__, so state
    # (there is none beyond the class table) is process-wide
    _shared_state = {}

    def __new__(cls, *p, **k):
        self = object.__new__(cls, *p, **k)
        self.__dict__ = cls._shared_state
        return self

    def isHiragana(self, char):
        # Hiragana range U+3041..U+3093 (excludes U+3094 and the marks
        # above it).
        return ( 0x3040 < ord(char) and ord(char) < 0x3094)

    def convert(self, text):
        '''Return (romaji, matched_length) for the longest prefix of
        *text* found in H2a_table; ("", -1) when nothing matches.'''
        Hstr = ""
        max_len = -1
        # NOTE(review): xrange(r) stops at r-1, so with min(4, ...) only
        # prefixes of up to 3 characters are tried — the 4-character
        # keys above (e.g. u"\u3063\u3046\u309b\u3041") can never
        # match. Confirm whether this should be min(5, len(text)+1).
        r = min(4, len(text)+1)
        for x in xrange(r):
            if text[:x] in self.H2a_table:
                if max_len < x:
                    max_len = x
                    Hstr = self.H2a_table[text[:x]]
        return (Hstr, max_len)

View File

@ -0,0 +1,564 @@
芦蘆
壱一
苅刈
舘館
曽曾
菟兎
島嶋
盃杯
冨富
峯峰
亘亙
弌一
乘乗
亂乱
豫予
亊事
弍二
亞亜
亰京
从従
仭仞
佛仏
來来
儘侭
伜倅
假仮
會会
做作
傳伝
僞偽
價価
儉倹
兒児
兔兎
竸競
兩両
囘回
册冊
冢塚
冩写
决決
冱冴
冰氷
况況
凉涼
處処
凾函
刄刃
刔抉
刧劫
剩剰
劍剣
劔剣
劒剣
剱剣
劑剤
辨弁
勞労
勳勲
勵励
勸勧
區区
卆卒
丗世
凖準
夘卯
卻却
卷巻
厠廁
厦廈
厮廝
厰廠
參参
雙双
咒呪
單単
噐器
營営
嚏嚔
嚴厳
囑嘱
囓齧
圀国
圈圏
國国
圍囲
圓円
團団
圖図
埀垂
埓埒
塲場
壞壊
墮堕
壓圧
壘塁
壥廛
壤壌
壯壮
壺壷
壹一
壻婿
壽寿
夂夊
夛多
梦夢
竒奇
奧奥
奬奨
侫佞
姙妊
嫻嫺
孃嬢
學学
斈学
寃冤
寇冦
寢寝
寫写
寶宝
寳宝
尅剋
將将
專専
對対
尓爾
尢尤
屆届
屬属
峽峡
嶌嶋
嵜崎
崙崘
嵳嵯
嶽岳
巛川
巵卮
帋紙
帶帯
幤幣
廐厩
廏厩
廣広
廚厨
廢廃
廳庁
廰庁
廸迪
弃棄
弉奘
彜彝
彈弾
彌弥
弯彎
徃往
徑径
從従
徠来
悳徳
恠怪
恆恒
悧俐
惡悪
惠恵
忰悴
惱悩
愼慎
愽博
慘惨
慚慙
憇憩
應応
懷懐
懴懺
戀恋
戞戛
戰戦
戲戯
拔抜
拏拿
擔担
拜拝
拂払
挾挟
搜捜
插挿
搖揺
攝摂
攪撹
據拠
擇択
擧拳
舉拳
抬擡
擴拡
攜携
攵攴
攷考
收収
效効
敕勅
敍叙
敘叙
數数
變変
斷断
旙旛
昜陽
晄晃
晉晋
晝昼
晰晢
暎映
曉暁
暸瞭
昿曠
曵曳
朖朗
朞期
霸覇
杤栃
杰傑
枩松
檜桧
條条
檮梼
梹檳
棊棋
棧桟
棕椶
楙茂
榮栄
槨椁
樂楽
權権
樞枢
樣様
樓楼
橢楕
檢検
櫻桜
鬱欝
盜盗
飮飲
歐嘔
歡歓
歸帰
殘残
殱殲
殼殻
毆殴
毓育
氣気
沒没
泪涙
濤涛
渕淵
渊淵
淨浄
淺浅
滿満
溂剌
溪渓
灌潅
滯滞
澁渋
澀渋
潛潜
濳潜
澂澄
澑溜
澤沢
濟済
濕湿
濱浜
濾滬
灣湾
烱炯
烟煙
熈煕
熏燻
燒焼
爐炉
爭争
爲為
爼俎
犁犂
犹猶
犲豺
狹狭
獎奨
默黙
獨独
獸獣
獵猟
獻献
珎珍
璢瑠
瑯琅
珱瓔
瓣弁
甞嘗
甼町
畄留
畍界
畊耕
畆畝
畧略
畫画
當当
畴疇
疊畳
疉畳
疂畳
癡痴
發発
皃猊
皈帰
皹皸
盖蓋
盡尽
蘯盪
眞真
眦眥
礦鉱
礪砺
碎砕
碯瑙
祕秘
祿禄
齋斎
禪禅
禮礼
禀稟
稱称
稻稲
稾稿
穗穂
穩穏
龝穐
穰穣
窗窓
竈竃
窰窯
竊窃
竝並
筺筐
笋筍
箟箘
筝箏
簔蓑
籠篭
籘籐
籖籤
粹粋
糺糾
絲糸
經経
總総
緜綿
縣県
縱縦
繪絵
繩縄
繼継
緕纃
續続
纖繊
纎繊
纜繿
缺欠
罐缶
罸罰
羃冪
羣群
羮羹
譱善
翆翠
翦剪
耻恥
聟婿
聨聯
聲声
聰聡
聽聴
肅粛
冐冒
脉脈
腦脳
腟膣
膓腸
膸髄
膽胆
臈臘
臟臓
臺台
與与
舊旧
舍舎
舖舗
舩船
艢檣
舮艫
艷艶
莖茎
莊荘
莵兎
菷帚
萠萌
蕚萼
蒂蔕
萬万
葢蓋
蘂蕊
蕋蕊
藪薮
藏蔵
藝芸
藥薬
蘓蘇
乕虎
號号
蠣蛎
蝨虱
蠅蝿
螢蛍
蟆蟇
蟲虫
蠏蟹
蟷螳
蟒蠎
蠶蚕
蠧蠹
蠻蛮
衂衄
衞衛
袵衽
裝装
襃褒
褝襌
覩睹
覺覚
覽覧
觀観
觧解
觸触
誡戒
謌歌
諡謚
謠謡
證証
譛譖
譯訳
譽誉
讀読
讓譲
讚賛
豐豊
貉狢
貍狸
貎猊
豼貔
貘獏
戝財
貭質
貳弐
貮弐
賤賎
賣売
贊賛
賍贓
赱走
踈疎
踴踊
躰体
軆体
軈軅
軣轟
輕軽
輙輒
輌輛
轉転
辭辞
辯弁
迯逃
逹達
逎遒
遞逓
遲遅
邊辺
邉辺
邨村
鄰隣
醉酔
醫医
釀醸
釋釈
釡釜
釼剣
銕鉄
錢銭
鎭鎮
鐵鉄
鐡鉄
鑒鑑
鑄鋳
鑛鉱
鈩鑪
鑚鑽
閇閉
濶闊
關関
阯址
陷陥
險険
隱隠
隸隷
襍雑
雜雑
靈霊
靜静
靱靭
韭韮
韲齏
韵韻
顏顔
顯顕
飃飄
餘余
餝飾
餠餅
騷騒
驅駆
驛駅
驗験
髓髄
體体
髮髪
鬪闘
鰺鯵
鰛鰮
鳬鳧
鳫鴈
鵄鴟
鵞鵝
鷄鶏
鷏鷆
鹽塩
麥麦
麸麩
麪麺
點点
黨党
皷鼓
鼡鼠
齊斉
齒歯
齡齢
龜亀
槇槙
遙遥
瑤瑶
凜凛
熙煕

View File

@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
# j2h.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original Copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
from calibre.ebooks.unihandecode.pykakasi.jisyo import jisyo
import re
class J2H (object):
    '''Convert a leading kanji sequence of a string to its hiragana
    reading, using the kanwa dictionary loaded by jisyo.'''
    kanwa = None

    # Consonant-class table for okurigana matching, indexed by
    # ord(char) - 0x3040 for hiragana U+3041..U+309F. Entry 35
    # (sokuon, U+3063) lists every consonant it can precede; entry 11
    # (U+304B, ka) is "k"; and so on.
    cl_table = [
        "","aiueow", "aiueow", "aiueow", "aiueow", "aiueow", "aiueow", "aiueow",
        "aiueow", "aiueow", "aiueow", "k", "g", "k", "g", "k", "g", "k", "g", "k",
        "g", "s", "zj", "s", "zj", "s", "zj", "s", "zj", "s", "zj", "t", "d", "tc",
        "d", "aiueokstchgzjfdbpw", "t", "d", "t", "d", "t", "d", "n", "n", "n", "n",
        "n", "h", "b", "p", "h", "b", "p", "hf", "b", "p", "h", "b", "p", "h", "b",
        "p", "m", "m", "m", "m", "m", "y", "y", "y", "y", "y", "y", "rl", "rl",
        "rl", "rl", "rl", "wiueo", "wiueo", "wiueo", "wiueo", "w", "n", "v", "k",
        "k", "", "", "", "", "", "", "", "", ""]

    def __init__(self):
        self.kanwa = jisyo()

    def isKanji(self, c):
        # CJK ideograph range used by the kanwa dictionary keys.
        return 0x3400 <= ord(c) < 0xfa2e

    def isCletter(self, l, c):
        '''True if hiragana character *c* can follow a dictionary entry
        whose tail marker is the consonant letter *l*.'''
        # The original read ``ord(u"")`` (an empty literal, most likely
        # a mis-encoded hiragana character), which raised TypeError on
        # every call; explicit codepoints avoid any source-encoding
        # dependence. The table is indexed so that U+3063 maps to the
        # sokuon entry (index 35).
        o = ord(c)
        if 0x3041 <= o <= 0x309f and l in self.cl_table[o - 0x3040]:
            return True
        return False

    def itaiji_conv(self, text):
        '''Replace variant kanji in *text* with their canonical forms.'''
        for c in text:
            if c in self.kanwa.itaijidict:
                # Plain replace: re.sub on a raw character would misfire
                # if the character were a regex metacharacter.
                text = text.replace(c, self.kanwa.itaijidict[c])
        return text

    def convert(self, text):
        '''Return (hiragana_reading, matched_length) for the longest
        dictionary entry matching a prefix of *text*; ("", 0) when the
        first character has no dictionary page.'''
        max_len = 0
        Hstr = ""
        table = self.kanwa.load_jisyo(text[0])
        if table is None:
            return ("", 0)
        for (k, v) in table.iteritems():
            length = len(k)
            if len(text) >= length and text.startswith(k):
                for (yomi, tail) in v:
                    if tail == '':  # was ``is ''`` — fragile identity test
                        if max_len < length:
                            Hstr = yomi
                            max_len = length
                    elif max_len < length+1 and len(text) > length and self.isCletter(tail, text[length]):
                        # Entry with an okurigana hint: consume the
                        # following hiragana as well.
                        Hstr = ''.join([yomi, text[length]])
                        max_len = length + 1
        return (Hstr, max_len)

View File

@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
# jisyo.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
from cPickle import load
import anydbm,marshal
from zlib import decompress
import os
import calibre.utils.resources as resources
class jisyo (object):
    '''Loads and caches the pykakasi dictionary resources: the kanwa
    dbm database, and the itaiji/kana pickles, from calibre's resource
    directory.'''
    kanwadict = None
    itaijidict = None
    kanadict = None
    jisyo_table = {}

    # this class is Borg: every instance shares one __dict__, so the
    # (large) dictionaries are only loaded once per process
    _shared_state = {}

    def __new__(cls, *p, **k):
        self = object.__new__(cls, *p, **k)
        self.__dict__ = cls._shared_state
        return self

    def __init__(self):
        # Each resource is loaded lazily the first time any instance is
        # constructed; later constructions are no-ops thanks to the
        # shared state.
        if self.kanwadict is None:
            dictpath = resources.get_path(os.path.join('localization','pykakasi','kanwadict2.db'))
            self.kanwadict = anydbm.open(dictpath,'r')
        if self.itaijidict is None:
            itaijipath = resources.get_path(os.path.join('localization','pykakasi','itaijidict2.pickle'))
            itaiji_pkl = open(itaijipath, 'rb')
            self.itaijidict = load(itaiji_pkl)
        if self.kanadict is None:
            kanadictpath = resources.get_path(os.path.join('localization','pykakasi','kanadict2.pickle'))
            kanadict_pkl = open(kanadictpath, 'rb')
            self.kanadict = load(kanadict_pkl)

    def load_jisyo(self, char):
        '''Return the dict of kanwa entries for *char*'s codepoint page,
        or None when the database has no page for it. Pages are
        decompressed/unmarshalled on first access and memoized in
        jisyo_table.'''
        try:#python2
            key = "%04x"%ord(unicode(char))
        except:#python3
            key = "%04x"%ord(char)
        try: #already exist?
            table = self.jisyo_table[key]
        except:
            try:
                # Values are zlib-compressed marshal dumps (see the
                # resource builder's kanwaout).
                table = self.jisyo_table[key] = marshal.loads(decompress(self.kanwadict[key]))
            except:
                return None
        return table

View File

@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
# k2a.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
from calibre.ebooks.unihandecode.pykakasi.jisyo import jisyo
class K2a (object):
    '''Longest-prefix conversion of katakana (and gairai words spelled
    in katakana) to romaji, using the kana dictionary loaded by
    jisyo.'''
    kanwa = None

    def __init__(self):
        self.kanwa = jisyo()

    def isKatakana(self, char):
        # Katakana block U+30A1..U+30F6.
        return 0x30a0 < ord(char) < 0x30f7

    def convert(self, text):
        '''Return (romaji, matched_length) for the longest prefix of
        *text* found in the kana dictionary; ("", -1) when nothing
        matches.'''
        best = ""
        best_len = -1
        # range (not py2-only xrange) keeps this module usable on
        # Python 3, matching the py2/py3 shims elsewhere in the package.
        # Prefixes longer than 9 characters are never dictionary keys.
        for size in range(min(10, len(text) + 1)):
            prefix = text[:size]
            if prefix in self.kanwa.kanadict and best_len < size:
                best_len = size
                best = self.kanwa.kanadict[prefix]
        return (best, best_len)

View File

@ -0,0 +1,101 @@
# -*- coding: utf-8 -*-
# kakasi.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original Copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
from calibre.ebooks.unihandecode.pykakasi.j2h import J2H
from calibre.ebooks.unihandecode.pykakasi.h2a import H2a
from calibre.ebooks.unihandecode.pykakasi.k2a import K2a
class kakasi(object):
    '''Drives Japanese romanization: walks the input string and
    dispatches each run of characters to the kanji (J2H), hiragana
    (H2a) or katakana (K2a) converter, copying everything else
    through unchanged.'''
    j2h = None
    h2a = None
    k2a = None

    def __init__(self):
        self.j2h = J2H()
        self.h2a = H2a()
        self.k2a = K2a()

    def do(self, text):
        '''Return *text* with Japanese character runs replaced by
        romaji words (capitalized for kanji-derived words, space
        separated).'''
        otext = ''
        i = 0
        while True:
            if i >= len(text):
                break
            if self.j2h.isKanji(text[i]):
                # Kanji: first get the hiragana reading, then romanize
                # that reading with the hiragana converter.
                (t, l) = self.j2h.convert(text[i:])
                if l <= 0:
                    # No dictionary match: copy the character through.
                    otext = otext + text[i]
                    i = i + 1
                    continue
                i = i + l
                m = 0
                tmptext = ""
                while True:
                    if m >= len(t):
                        break
                    (s, n) = self.h2a.convert(t[m:])
                    if n <= 0:
                        break
                    m = m + n
                    tmptext = tmptext+s
                # Kanji-derived words are capitalized; a space is added
                # unless we are at the end of the input.
                if i >= len(text):
                    otext = otext + tmptext.capitalize()
                else:
                    otext = otext + tmptext.capitalize() +' '
            elif self.h2a.isHiragana(text[i]):
                # NOTE(review): this loop assumes every character in the
                # hiragana range has a table entry; if h2a.convert ever
                # returned l == -1 here, i would step backwards —
                # confirm the table covers U+3041..U+3093 completely.
                tmptext = ''
                while True:
                    (t, l) = self.h2a.convert(text[i:])
                    tmptext = tmptext+t
                    i = i + l
                    if i >= len(text):
                        otext = otext + tmptext
                        break
                    elif not self.h2a.isHiragana(text[i]):
                        otext = otext + tmptext + ' '
                        break
            elif self.k2a.isKatakana(text[i]):
                # Same assumption as the hiragana branch, for the kana
                # dictionary.
                tmptext = ''
                while True:
                    (t, l) = self.k2a.convert(text[i:])
                    tmptext = tmptext+t
                    i = i + l
                    if i >= len(text):
                        otext = otext + tmptext
                        break
                    elif not self.k2a.isKatakana(text[i]):
                        otext = otext + tmptext + ' '
                        break
            else:
                # Not Japanese: pass through unchanged.
                otext = otext + text[i]
                i += 1
        return otext

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,317 @@
;; Kana-Alphabet mapping dictionary
;;
;; To use this mapping table,
;; you should unicode normalize NKFC form.
;;
;; basic mapping
;;
a ァ
a ア
ba バ
bba ッバ
bbe ッベ
bbi ッビ
bbo ッボ
bbu ッブ
bbya ッビャ
bbyo ッビョ
bbyu ッビュ
be ベ
bi ビ
bo ボ
bu ブ
bya ビャ
byo ビョ
byu ビュ
cha チャ
che チェ
chi チ
cho チョ
chu チュ
da ダ
dda ッダ
dde ッデ
ddo ッド
de デ
di ディ
do ド
e ェ
e エ
e ヱ
fa ファ
fe フェ
ffa ッファ
ffe ッフェ
ffi ッフィ
ffo ッフォ
ffu ッフ
fi フィ
fo フォ
fu フ
ga ガ
ge ゲ
gga ッガ
gge ッゲ
ggi ッギ
ggo ッゴ
ggu ッグ
ggya ッギャ
ggyo ッギョ
ggyu ッギュ
gi ギ
go ゴ
gu グ
gya グャ
gyo ギョ
gyu ギゥ
ha ハ
he ヘ
hha ッハ
hhe ッヘ
hhi ッヒ
hho ッホ
hhya ッヒャ
hhyo ッヒョ
hhyu ッヒュ
hi ヒ
ho ホ
hya ヒャ
hyo ヒョ
hyu ヒュ
i ィ
i イ
i ヰ
ja ジャ
ja ヂャ
ji ジ
ji ヂ
jja ッジャ
jji ッジ
jji ッヂ
jjo ッジョ
jju ッジュ
jjya ッヂャ
jjyo ッヂョ
jjyu ッヂュ
jo ジョ
jo ヂョ
ju ジュ
ju ヂュ
ka カ
ka ヵ
ke ケ
ke ヶ
ki キ
kka ッカ
kke ッケ
kki ッキ
kko ッコ
kku ック
kkya ッキャ
kkyo ッキョ
kkyu ッキュ
ko コ
ku ク
kya キァ
kyo キォ
kyu キゥ
ma マ
me メ
mi ミ
mo モ
mu ム
mya ミャ
myo ミョ
myu ミュ
n ン
n'a ンア
n'e ンエ
n'i ンイ
n'o ンオ
n'u ンウ
na ナ
ne ネ
ni ニ
no ノ
nu ヌ
nya ニャ
nyo ニョ
nyu ニュ
o ォ
o オ
pa パ
pe ペ
pi ピ
po ポ
ppa ッパ
ppe ッペ
ppi ッピ
ppo ッポ
ppu ップ
ppya ッピャ
ppyo ッピョ
ppyu ッピュ
pu プ
pya ピャ
pyo ピョ
pyu ピュ
ra ラ
re レ
ri リ
ro ロ
rra ッラ
rre ッレ
rri ッリ
rro ッロ
rru ッル
rrya ッリャ
rryo ッリョ
rryu ッリュ
ru ル
rya リャ
ryo リョ
ryu リュ
sa サ
se セ
sha シャ
shi シ
sho ショ
shu シュ
so ソ
ssa ッサ
sse ッセ
ssha ッシャ
sshi ッシ
ssho ッショ
sshu ッシュ
sso ッソ
ssu ッス
su ス
ta タ
tcha ッチャ
tchi ッチ
tcho ッチョ
tchu ッチュ
te テ
to ト
tsu ッ
tsu ツ
tta ッタ
tte ッテ
tto ット
ttsu ッツ
u ゥ
u ウ
va ヴァ
ve ヴェ
vi ヴィ
vo ヴォ
vu ヴ
vva ッヴァ
vve ッヴェ
vvi ッヴィ
vvo ッヴォ
vvu ッヴ
wa ヮ
wa ワ
wo ヲ
ya ャ
ya ヤ
yo ョ
yo ヨ
yu ュ
yu ユ
yya ッヤ
yyo ッヨ
yyu ッユ
za ザ
ze ゼ
zo ゾ
zu ズ
zu ヅ
zza ッザ
zzo ッゾ
zzu ッズ
zzu ッヅ
;;
;; extended characters
;;
;;
;; gairai terms
;;
all オール
algorithm アルゴリズム
answer アンサー
base ベース
beginner ビギナー
connection コネクション
contents コンテンツ
creator クリエーター
comic コミック
comics コミックス
culture カルチャー
debug デバッグ
debugging デバッギング
design デザイン
digital デジタル
dilemma ジレンマ
directory ディレクトリ
disk ディスク
document ドキュメント
download ダウンロード
electric エレクトリック
facebook フェイスブック
firefox ファイアーフォックス
folder フォルダ
format フォーマット
forum フォーラム
fox フォックス
free フリー
gnome ノーム
gnu グヌー
gozilla ゴジラ
guide ガイド
harvard ハーバード
help ヘルプ
highlight ハイライト
japan ジャパン
journal ジャーナル
library ライブラリ
line ライン
love ラヴ
love ラブ
mail メール
main メイン
mystery ミステリ
mozilla モジラ
network ネットワーク
next ネクスト
new ニュー
news ニュース
native ネイティブ
online オンライン
open オープン
professional プロフェッショナル
profile プロファイル
programmer プログラマ
sample サンプル
series シリーズ
share シェア
social ソーシャル
society ソサエティ
software ソフトウエア
source ソース
street ストリート
system システム
tag タグ
text テキスト
thunderbird サンダーバード
training トレーニング
twitter ツイッター
unicode ユニコード
wall ウオール
wall ウォール
welcome ウェルカム
welcome ウエルカム
wikinomics ウィキノミクス
york ヨーク

File diff suppressed because it is too large Load Diff

View File

@ -1,12 +1,16 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
'''
Decode unicode text to an ASCII representation of the text. Transliterate
unicode characters to ASCII.
Decode unicode text to an ASCII representation of the text in Chinese.
Transliterate unicode characters to ASCII based on chinese pronounce.
derived from John's unidecode library.
Copyright(c) 2009, John Schember
Based on the ruby unidecode gem (http://rubyforge.org/projects/unidecode/) which
is based on the perl module Text::Unidecode
@ -55,29 +59,20 @@ it under the same terms as Perl itself.
'''
import re
from calibre.ebooks.unidecode.unicodepoints import CODEPOINTS
from calibre.constants import preferred_encoding
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
from calibre.ebooks.unihandecode.zhcodepoints import CODEPOINTS as HANCODES
class Unidecoder(object):
codepoints = {}
def __init__(self):
self.codepoints = CODEPOINTS
self.codepoints.update(HANCODES)
def decode(self, text):
'''
Transliterate the string from unicode characters to ASCII.
'''
# The keys for CODEPOINTS is unicode characters, we want to be sure the
# input text is unicode.
if not isinstance(text, unicode):
try:
text = unicode(text)
except:
try:
text = text.decode(preferred_encoding)
except:
text = text.decode('utf-8', 'replace')
# Replace characters larger than 127 with their ASCII equivelent.
return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),
text)
return re.sub('[^\x00-\x7f]',lambda x: self.replace_point(x.group()), text)
def replace_point(self, codepoint):
'''
@ -87,7 +82,7 @@ class Unidecoder(object):
# Split the unicode character xABCD into parts 0xAB and 0xCD.
# 0xAB represents the group within CODEPOINTS to query and 0xCD
# represents the position in the list of characters for the group.
return CODEPOINTS[self.code_group(codepoint)][self.grouped_point(
return self.codepoints[self.code_group(codepoint)][self.grouped_point(
codepoint)]
except:
return '?'
@ -97,12 +92,18 @@ class Unidecoder(object):
Find what group character is a part of.
'''
# Code groups withing CODEPOINTS take the form 'xAB'
return u'x%02x' % (ord(unicode(character)) >> 8)
try:#python2
return 'x%02x' % (ord(unicode(character)) >> 8)
except:
return 'x%02x' % (ord(character) >> 8)
def grouped_point(self, character):
'''
Return the location the replacement character is in the list for a
the group character is a part of.
'''
return ord(unicode(character)) & 255
try:#python2
return ord(unicode(character)) & 255
except:
return ord(character) & 255

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
'''
Decode unicode text to an ASCII representation of the text in Vietnamese.
'''
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
from calibre.ebooks.unihandecode.vncodepoints import CODEPOINTS as HANCODES
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
class Vndecoder(Unidecoder):
    '''Vietnamese transliterator: Unidecoder with the Vietnamese
    codepoint tables layered on top.'''
    codepoints = {}

    def __init__(self):
        # Copy before updating: the original updated the imported
        # CODEPOINTS dict in place, leaking the Vietnamese overrides
        # into every other decoder that shares the module-level table.
        self.codepoints = dict(CODEPOINTS)
        self.codepoints.update(HANCODES)

File diff suppressed because it is too large Load Diff

View File

@ -6,12 +6,12 @@ meaning as possible.
import os
from math import ceil
from calibre.ebooks.unidecode.unidecoder import Unidecoder
from calibre import sanitize_file_name
from calibre.constants import preferred_encoding, iswindows
udc = Unidecoder()
from calibre.utils.localization import get_udc
def ascii_text(orig):
udc = get_udc()
try:
ascii = udc.decode(orig)
except:

View File

@ -169,3 +169,13 @@ def set_qt_translator(translator):
return translator.load(p)
return False
_udc = None

def get_udc():
    '''Return the process-wide Unihandecoder, created lazily for the
    current calibre interface language.'''
    global _udc
    if _udc is not None:
        return _udc
    from calibre.ebooks.unihandecode import Unihandecoder
    _udc = Unihandecoder(lang=get_lang())
    return _udc