class Jadecoder(Unidecoder):
    """Unidecoder variant that romanizes Japanese text with the bundled kakasi engine.

    Text is first run through the pure-Python ``kakasi`` converter; whatever
    non-ASCII characters remain are then handed to ``replace_point`` (inherited
    from ``Unidecoder`` -- not visible here).
    """

    # Converter instance; replaced per instance in __init__.
    kakasi = None
    # Code-point table; replaced per instance in __init__.
    codepoints = {}

    def __init__(self):
        # Work on a copy: calling update() on CODEPOINTS itself would leak the
        # Japanese overrides into every other user of that module-level dict.
        self.codepoints = dict(CODEPOINTS)
        self.codepoints.update(JACODES)
        self.kakasi = kakasi()

    def decode(self, text):
        """Return an ASCII transliteration of *text*.

        If *text* cannot be encoded as EUC-JP (i.e. it is not purely
        Japanese/ASCII) or the kakasi conversion fails, the input is
        transliterated character by character without kakasi.
        """
        try:
            # Encoding is only a probe; it raises for characters outside EUC-JP.
            text.encode('euc-jp')
            romanized = self.kakasi.do(text)
        except Exception:
            # Best effort: fall back to the plain code-point replacement.
            romanized = text
        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), romanized)
class H2a (object):
    """Hiragana to romaji converter using longest-prefix lookup in a static table."""

    # Maps hiragana sequences (1 to 4 code points) to their romanization.
    H2a_table = {
        u"\u3041":"a", u"\u3042":"a",
        u"\u3043":"i", u"\u3044":"i",
        u"\u3045":"u", u"\u3046":"u",
        u"\u3046\u309b":"vu", u"\u3046\u309b\u3041":"va",
        u"\u3046\u309b\u3043":"vi", u"\u3046\u309b\u3047":"ve",
        u"\u3046\u309b\u3049":"vo",
        u"\u3047":"e", u"\u3048":"e",
        u"\u3049":"o", u"\u304a":"o",

        u"\u304b":"ka", u"\u304c":"ga",
        # kya/kyu/kyo and gya/gyu are keyed on small ya/yu/yo, like every
        # other contracted row in this table.
        u"\u304d":"ki", u"\u304d\u3083":"kya",
        u"\u304d\u3085":"kyu", u"\u304d\u3087":"kyo",
        u"\u304e":"gi", u"\u304e\u3083":"gya",
        u"\u304e\u3085":"gyu", u"\u304e\u3087":"gyo",
        u"\u304f":"ku", u"\u3050":"gu",
        u"\u3051":"ke", u"\u3052":"ge",
        u"\u3053":"ko", u"\u3054":"go",

        u"\u3055":"sa", u"\u3056":"za",
        u"\u3057":"shi", u"\u3057\u3083":"sha",
        u"\u3057\u3085":"shu", u"\u3057\u3087":"sho",
        u"\u3058":"ji", u"\u3058\u3083":"ja",
        u"\u3058\u3085":"ju", u"\u3058\u3087":"jo",
        u"\u3059":"su", u"\u305a":"zu",
        u"\u305b":"se", u"\u305c":"ze",
        u"\u305d":"so", u"\u305e":"zo",

        u"\u305f":"ta", u"\u3060":"da",
        u"\u3061":"chi", u"\u3061\u3047":"che", u"\u3061\u3083":"cha",
        u"\u3061\u3085":"chu", u"\u3061\u3087":"cho",
        u"\u3062":"ji", u"\u3062\u3083":"ja",
        u"\u3062\u3085":"ju", u"\u3062\u3087":"jo",

        u"\u3063":"tsu",
        u"\u3063\u3046\u309b":"vvu",
        u"\u3063\u3046\u309b\u3041":"vva",
        u"\u3063\u3046\u309b\u3043":"vvi",
        u"\u3063\u3046\u309b\u3047":"vve",
        u"\u3063\u3046\u309b\u3049":"vvo",
        u"\u3063\u304b":"kka", u"\u3063\u304c":"gga",
        u"\u3063\u304d":"kki", u"\u3063\u304d\u3083":"kkya",
        u"\u3063\u304d\u3085":"kkyu", u"\u3063\u304d\u3087":"kkyo",
        u"\u3063\u304e":"ggi", u"\u3063\u304e\u3083":"ggya",
        u"\u3063\u304e\u3085":"ggyu", u"\u3063\u304e\u3087":"ggyo",
        u"\u3063\u304f":"kku", u"\u3063\u3050":"ggu",
        u"\u3063\u3051":"kke", u"\u3063\u3052":"gge",
        u"\u3063\u3053":"kko", u"\u3063\u3054":"ggo",
        u"\u3063\u3055":"ssa", u"\u3063\u3056":"zza",
        u"\u3063\u3057":"sshi", u"\u3063\u3057\u3083":"ssha",
        u"\u3063\u3057\u3085":"sshu", u"\u3063\u3057\u3087":"ssho",
        u"\u3063\u3058":"jji", u"\u3063\u3058\u3083":"jja",
        u"\u3063\u3058\u3085":"jju", u"\u3063\u3058\u3087":"jjo",
        u"\u3063\u3059":"ssu", u"\u3063\u305a":"zzu",
        # "zze" is keyed on U+305C (ze); it used to share the U+305E (zo) key
        # with "zzo" and was silently overwritten.
        u"\u3063\u305b":"sse", u"\u3063\u305c":"zze",
        u"\u3063\u305d":"sso", u"\u3063\u305e":"zzo",
        u"\u3063\u305f":"tta", u"\u3063\u3060":"dda",
        u"\u3063\u3061":"tchi", u"\u3063\u3061\u3083":"tcha",
        u"\u3063\u3061\u3085":"tchu", u"\u3063\u3061\u3087":"tcho",
        u"\u3063\u3062":"jji", u"\u3063\u3062\u3083":"jjya",
        u"\u3063\u3062\u3085":"jjyu", u"\u3063\u3062\u3087":"jjyo",
        u"\u3063\u3064":"ttsu", u"\u3063\u3065":"zzu",
        u"\u3063\u3066":"tte", u"\u3063\u3067":"dde",
        u"\u3063\u3068":"tto", u"\u3063\u3069":"ddo",
        u"\u3063\u306f":"hha", u"\u3063\u3070":"bba",
        u"\u3063\u3071":"ppa",
        u"\u3063\u3072":"hhi", u"\u3063\u3072\u3083":"hhya",
        u"\u3063\u3072\u3085":"hhyu", u"\u3063\u3072\u3087":"hhyo",
        u"\u3063\u3073":"bbi", u"\u3063\u3073\u3083":"bbya",
        u"\u3063\u3073\u3085":"bbyu", u"\u3063\u3073\u3087":"bbyo",
        u"\u3063\u3074":"ppi", u"\u3063\u3074\u3083":"ppya",
        u"\u3063\u3074\u3085":"ppyu", u"\u3063\u3074\u3087":"ppyo",
        u"\u3063\u3075":"ffu", u"\u3063\u3075\u3041":"ffa",
        u"\u3063\u3075\u3043":"ffi", u"\u3063\u3075\u3047":"ffe",
        u"\u3063\u3075\u3049":"ffo",
        u"\u3063\u3076":"bbu", u"\u3063\u3077":"ppu",
        u"\u3063\u3078":"hhe", u"\u3063\u3079":"bbe",
        u"\u3063\u307a":"ppe",
        u"\u3063\u307b":"hho", u"\u3063\u307c":"bbo",
        u"\u3063\u307d":"ppo",
        u"\u3063\u3084":"yya", u"\u3063\u3086":"yyu",
        u"\u3063\u3088":"yyo",
        u"\u3063\u3089":"rra", u"\u3063\u308a":"rri",
        u"\u3063\u308a\u3083":"rrya", u"\u3063\u308a\u3085":"rryu",
        u"\u3063\u308a\u3087":"rryo",
        u"\u3063\u308b":"rru", u"\u3063\u308c":"rre",
        u"\u3063\u308d":"rro",

        u"\u3064":"tsu", u"\u3065":"zu",
        u"\u3066":"te", u"\u3067":"de", u"\u3067\u3043":"di",
        u"\u3068":"to", u"\u3069":"do",

        u"\u306a":"na",
        u"\u306b":"ni", u"\u306b\u3083":"nya",
        u"\u306b\u3085":"nyu", u"\u306b\u3087":"nyo",
        u"\u306c":"nu", u"\u306d":"ne", u"\u306e":"no",

        u"\u306f":"ha", u"\u3070":"ba", u"\u3071":"pa",
        u"\u3072":"hi", u"\u3072\u3083":"hya",
        u"\u3072\u3085":"hyu", u"\u3072\u3087":"hyo",
        u"\u3073":"bi", u"\u3073\u3083":"bya",
        u"\u3073\u3085":"byu", u"\u3073\u3087":"byo",
        u"\u3074":"pi", u"\u3074\u3083":"pya",
        u"\u3074\u3085":"pyu", u"\u3074\u3087":"pyo",
        u"\u3075":"fu", u"\u3075\u3041":"fa",
        u"\u3075\u3043":"fi", u"\u3075\u3047":"fe",
        u"\u3075\u3049":"fo",
        u"\u3076":"bu", u"\u3077":"pu",
        u"\u3078":"he", u"\u3079":"be", u"\u307a":"pe",
        u"\u307b":"ho", u"\u307c":"bo", u"\u307d":"po",

        u"\u307e":"ma",
        u"\u307f":"mi", u"\u307f\u3083":"mya",
        u"\u307f\u3085":"myu", u"\u307f\u3087":"myo",
        u"\u3080":"mu", u"\u3081":"me", u"\u3082":"mo",

        u"\u3083":"ya", u"\u3084":"ya",
        u"\u3085":"yu", u"\u3086":"yu",
        u"\u3087":"yo", u"\u3088":"yo",

        u"\u3089":"ra",
        u"\u308a":"ri", u"\u308a\u3083":"rya",
        u"\u308a\u3085":"ryu", u"\u308a\u3087":"ryo",
        u"\u308b":"ru", u"\u308c":"re", u"\u308d":"ro",

        u"\u308e":"wa", u"\u308f":"wa",
        u"\u3090":"i", u"\u3091":"e",
        u"\u3092":"wo", u"\u3093":"n",

        u"\u3093\u3042":"n'a", u"\u3093\u3044":"n'i",
        u"\u3093\u3046":"n'u", u"\u3093\u3048":"n'e",
        u"\u3093\u304a":"n'o",
        }

    # Longest key in the table; convert() never needs a longer prefix.
    _MAX_KEY_LEN = max(map(len, H2a_table))

    def isHiragana(self, char):
        """Return True if *char* lies in U+3041..U+3093."""
        return 0x3040 < ord(char) < 0x3094

    def convert(self, text):
        """Romanize the longest table entry that prefixes *text*.

        Returns ``(romaji, consumed)`` where *consumed* is the number of
        characters of *text* the match covers, or ``("", -1)`` when no
        prefix of *text* is in the table (including empty *text*).
        """
        Hstr = ""
        max_len = -1
        # Try every prefix length up to the longest key (previously capped at
        # 3, which made the 4-character "vva"-style entries unreachable).
        for x in range(1, min(self._MAX_KEY_LEN, len(text)) + 1):
            prefix = text[:x]
            if prefix in self.H2a_table:
                max_len = x
                Hstr = self.H2a_table[prefix]
        return (Hstr, max_len)
b/src/calibre/ebooks/unihandecode/pykakasi/j2h.py new file mode 100644 index 0000000000..d0f6066446 --- /dev/null +++ b/src/calibre/ebooks/unihandecode/pykakasi/j2h.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- +# j2h.py +# +# Copyright 2011 Hiroshi Miura +# +# Original Copyright: +# * KAKASI (Kanji Kana Simple inversion program) +# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $ +# * Copyright (C) 1992 +# * Hironobu Takahashi (takahasi@tiny.or.jp) +# * +# * This program is free software; you can redistribute it and/or modify +# * it under the terms of the GNU General Public License as published by +# * the Free Software Foundation; either versions 2, or (at your option) +# * any later version. +# * +# * This program is distributed in the hope that it will be useful +# * but WITHOUT ANY WARRANTY; without even the implied warranty of +# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# * GNU General Public License for more details. +# * +# * You should have received a copy of the GNU General Public License +# * along with KAKASI, see the file COPYING. If not, write to the Free +# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA +# * 02111-1307, USA. 
class J2H (object):
    """Kanji to hiragana converter backed by the ``jisyo`` dictionary wrapper."""

    # Dictionary wrapper; set per instance in __init__.
    kanwa = None

    # cl_table[n] lists the romaji letters that may start the reading of the
    # kana at code point U+3040 + n (96 entries, covering U+3040..U+309F).
    cl_table = [
        "","aiueow", "aiueow", "aiueow", "aiueow", "aiueow", "aiueow", "aiueow",
        "aiueow", "aiueow", "aiueow", "k", "g", "k", "g", "k", "g", "k", "g", "k",
        "g", "s", "zj", "s", "zj", "s", "zj", "s", "zj", "s", "zj", "t", "d", "tc",
        "d", "aiueokstchgzjfdbpw", "t", "d", "t", "d", "t", "d", "n", "n", "n", "n",
        "n", "h", "b", "p", "h", "b", "p", "hf", "b", "p", "h", "b", "p", "h", "b",
        "p", "m", "m", "m", "m", "m", "y", "y", "y", "y", "y", "y", "rl", "rl",
        "rl", "rl", "rl", "wiueo", "wiueo", "wiueo", "wiueo", "w", "n", "v", "k",
        "k", "", "", "", "", "", "", "", "", ""]

    def __init__(self):
        self.kanwa = jisyo()

    def isKanji(self, c):
        """Return True if *c* lies in U+3400..U+FA2D."""
        return 0x3400 <= ord(c) < 0xfa2e

    def isCletter(self, l, c):
        """Return True if hiragana *c* can start with the romaji letter *l*.

        Characters outside U+3041..U+309F always yield False.
        """
        code = ord(c)
        if not (0x3041 <= code <= 0x309f):
            return False
        # Slot 0 of cl_table corresponds to U+3040, so small "a" (U+3041)
        # maps to slot 1. The previous offset was two slots too low.
        return l in self.cl_table[code - 0x3040]

    def itaiji_conv(self, text):
        """Replace every character of *text* found in ``kanwa.itaijidict``.

        Replacements are applied one source character at a time, in order of
        first appearance in the original *text*.
        """
        variants = [c for c in text if c in self.kanwa.itaijidict]
        for c in variants:
            # Literal replacement; the character must not be read as a regex.
            text = text.replace(c, self.kanwa.itaijidict[c])
        return text

    def convert(self, text):
        """Find the longest dictionary entry that prefixes *text*.

        Returns ``(yomi, consumed)``. When an entry carries a tail letter it
        only matches if the character following the entry is a hiragana that
        can start with that letter; that character is then appended to the
        reading and counted in *consumed*. Returns ``("", 0)`` when nothing
        matches or *text* is empty.
        """
        if not text:
            return ("", 0)
        table = self.kanwa.load_jisyo(text[0])
        if table is None:
            return ("", 0)
        max_len = 0
        Hstr = ""
        # items() rather than iteritems() so this runs on Python 2 and 3.
        for (k, v) in table.items():
            length = len(k)
            if len(text) < length or not text.startswith(k):
                continue
            for (yomi, tail) in v:
                if tail == '':
                    if max_len < length:
                        Hstr = yomi
                        max_len = length
                elif max_len < length+1 and len(text) > length and self.isCletter(tail, text[length]):
                    Hstr = ''.join([yomi, text[length]])
                    max_len = length+1
        return (Hstr, max_len)
class jisyo (object):
    """Access to the kanji reading database and the variant-kanji map.

    The open dictionaries and the cache of decoded tables live on the class,
    so all instances share them.
    """

    # dbm handle for the kanji reading database; opened on first instantiation.
    kanwadict = None
    # Variant-character mapping loaded from the pickle; loaded once.
    itaijidict = None
    # Cache of decoded per-character tables, keyed by 4-digit hex code point.
    jisyo_table = {}

    def __init__(self):
        # NOTE(review): both paths are relative to the current working
        # directory -- confirm callers always run from the expected directory.
        cls = type(self)
        if self.kanwadict is None:
            dictpath = os.path.join('unihandecode','pykakasi','kanwadict2.db')
            # Store on the class: an instance attribute would leave the class
            # attribute None and make every new instance reopen the database.
            cls.kanwadict = anydbm.open(dictpath,'r')
        if self.itaijidict is None:
            itaijipath = os.path.join('unihandecode','pykakasi','itaijidict2.pickle')
            # Context manager so the pickle file handle is closed after loading.
            with open(itaijipath, 'rb') as itaiji_pkl:
                cls.itaijidict = load(itaiji_pkl)

    def load_jisyo(self, char):
        """Return the decoded dictionary table for *char*, or None.

        Tables are looked up by the 4-digit hex code point of *char*,
        decompressed and unmarshalled on first use, then cached in
        ``jisyo_table``. None is returned when the entry is missing or
        cannot be decoded.
        """
        try:  # Python 2
            key = "%04x"%ord(unicode(char))
        except (NameError, UnicodeError):  # Python 3 has no unicode()
            key = "%04x"%ord(char)

        try:
            return self.jisyo_table[key]
        except KeyError:
            pass

        try:
            table = marshal.loads(decompress(self.kanwadict[key]))
        except Exception:
            # Missing key or undecodable record: report "no entry" rather
            # than failing the whole conversion.
            return None
        self.jisyo_table[key] = table
        return table
class K2a (object):
    """Katakana to romaji converter using longest-prefix lookup in a static table."""

    # Maps katakana sequences (1 to 4 code points) to their romanization.
    K2a_table = {
        u"\u30a1":"a", u"\u30a2":"a",
        u"\u30a3":"i", u"\u30a4":"i",
        u"\u30a5":"u", u"\u30a6":"u",
        u"\u30a6\u309b":"vu", u"\u30a6\u309b\u30a1":"va",
        u"\u30a6\u309b\u30a3":"vi", u"\u30a6\u309b\u30a7":"ve",
        u"\u30a6\u309b\u30a9":"vo",
        u"\u30a7":"e", u"\u30a8":"e",
        u"\u30a9":"o", u"\u30aa":"o",

        u"\u30ab":"ka", u"\u30ac":"ga",
        # kya/kyu/kyo and gya/gyu are keyed on small ya/yu/yo, like every
        # other contracted row in this table.
        u"\u30ad":"ki", u"\u30ad\u30e3":"kya",
        u"\u30ad\u30e5":"kyu", u"\u30ad\u30e7":"kyo",
        u"\u30ae":"gi", u"\u30ae\u30e3":"gya",
        u"\u30ae\u30e5":"gyu", u"\u30ae\u30e7":"gyo",
        u"\u30af":"ku", u"\u30b0":"gu",
        u"\u30b1":"ke", u"\u30b2":"ge",
        u"\u30b3":"ko", u"\u30b4":"go",

        u"\u30b5":"sa", u"\u30b6":"za",
        u"\u30b7":"shi", u"\u30b7\u30e3":"sha",
        u"\u30b7\u30e5":"shu", u"\u30b7\u30e7":"sho",
        u"\u30b8":"ji", u"\u30b8\u30e3":"ja",
        u"\u30b8\u30e5":"ju", u"\u30b8\u30e7":"jo",
        u"\u30b9":"su", u"\u30ba":"zu",
        u"\u30bb":"se", u"\u30bc":"ze",
        u"\u30bd":"so", u"\u30be":"zo",

        u"\u30bf":"ta", u"\u30c0":"da",
        u"\u30c1":"chi", u"\u30c1\u30a7":"che", u"\u30c1\u30e3":"cha",
        u"\u30c1\u30e5":"chu", u"\u30c1\u30e7":"cho",
        u"\u30c2":"ji", u"\u30c2\u30e3":"ja",
        u"\u30c2\u30e5":"ju", u"\u30c2\u30e7":"jo",

        u"\u30c3":"tsu",
        u"\u30c3\u30a6\u309b":"vvu",
        u"\u30c3\u30a6\u309b\u30a1":"vva",
        u"\u30c3\u30a6\u309b\u30a3":"vvi",
        u"\u30c3\u30a6\u309b\u30a7":"vve",
        u"\u30c3\u30a6\u309b\u30a9":"vvo",
        u"\u30c3\u30ab":"kka", u"\u30c3\u30ac":"gga",
        u"\u30c3\u30ad":"kki", u"\u30c3\u30ad\u30e3":"kkya",
        u"\u30c3\u30ad\u30e5":"kkyu", u"\u30c3\u30ad\u30e7":"kkyo",
        u"\u30c3\u30ae":"ggi", u"\u30c3\u30ae\u30e3":"ggya",
        u"\u30c3\u30ae\u30e5":"ggyu", u"\u30c3\u30ae\u30e7":"ggyo",
        u"\u30c3\u30af":"kku", u"\u30c3\u30b0":"ggu",
        u"\u30c3\u30b1":"kke", u"\u30c3\u30b2":"gge",
        u"\u30c3\u30b3":"kko", u"\u30c3\u30b4":"ggo",
        u"\u30c3\u30b5":"ssa", u"\u30c3\u30b6":"zza",
        u"\u30c3\u30b7":"sshi", u"\u30c3\u30b7\u30e3":"ssha",
        u"\u30c3\u30b7\u30e5":"sshu", u"\u30c3\u30b7\u30e7":"ssho",
        u"\u30c3\u30b8":"jji", u"\u30c3\u30b8\u30e3":"jja",
        u"\u30c3\u30b8\u30e5":"jju", u"\u30c3\u30b8\u30e7":"jjo",
        u"\u30c3\u30b9":"ssu", u"\u30c3\u30ba":"zzu",
        # "zze" is keyed on U+30BC (ze); it used to share the U+30BE (zo) key
        # with "zzo" and was silently overwritten.
        u"\u30c3\u30bb":"sse", u"\u30c3\u30bc":"zze",
        u"\u30c3\u30bd":"sso", u"\u30c3\u30be":"zzo",
        u"\u30c3\u30bf":"tta", u"\u30c3\u30c0":"dda",
        u"\u30c3\u30c1":"tchi", u"\u30c3\u30c1\u30e3":"tcha",
        u"\u30c3\u30c1\u30e5":"tchu", u"\u30c3\u30c1\u30e7":"tcho",
        u"\u30c3\u30c2":"jji", u"\u30c3\u30c2\u30e3":"jjya",
        u"\u30c3\u30c2\u30e5":"jjyu", u"\u30c3\u30c2\u30e7":"jjyo",
        u"\u30c3\u30c4":"ttsu", u"\u30c3\u30c5":"zzu",
        u"\u30c3\u30c6":"tte", u"\u30c3\u30c7":"dde",
        u"\u30c3\u30c8":"tto", u"\u30c3\u30c9":"ddo",
        u"\u30c3\u30cf":"hha", u"\u30c3\u30d0":"bba",
        u"\u30c3\u30d1":"ppa",
        u"\u30c3\u30d2":"hhi", u"\u30c3\u30d2\u30e3":"hhya",
        u"\u30c3\u30d2\u30e5":"hhyu", u"\u30c3\u30d2\u30e7":"hhyo",
        u"\u30c3\u30d3":"bbi", u"\u30c3\u30d3\u30e3":"bbya",
        u"\u30c3\u30d3\u30e5":"bbyu", u"\u30c3\u30d3\u30e7":"bbyo",
        u"\u30c3\u30d4":"ppi", u"\u30c3\u30d4\u30e3":"ppya",
        u"\u30c3\u30d4\u30e5":"ppyu", u"\u30c3\u30d4\u30e7":"ppyo",
        u"\u30c3\u30d5":"ffu", u"\u30c3\u30d5\u30a1":"ffa",
        u"\u30c3\u30d5\u30a3":"ffi", u"\u30c3\u30d5\u30a7":"ffe",
        u"\u30c3\u30d5\u30a9":"ffo",
        u"\u30c3\u30d6":"bbu", u"\u30c3\u30d7":"ppu",
        u"\u30c3\u30d8":"hhe", u"\u30c3\u30d9":"bbe",
        u"\u30c3\u30da":"ppe",
        u"\u30c3\u30db":"hho", u"\u30c3\u30dc":"bbo",
        u"\u30c3\u30dd":"ppo",
        u"\u30c3\u30e4":"yya", u"\u30c3\u30e6":"yyu",
        u"\u30c3\u30e8":"yyo",
        u"\u30c3\u30e9":"rra", u"\u30c3\u30ea":"rri",
        u"\u30c3\u30ea\u30e3":"rrya", u"\u30c3\u30ea\u30e5":"rryu",
        u"\u30c3\u30ea\u30e7":"rryo",
        u"\u30c3\u30eb":"rru", u"\u30c3\u30ec":"rre",
        u"\u30c3\u30ed":"rro",

        u"\u30c4":"tsu", u"\u30c5":"zu",
        u"\u30c6":"te", u"\u30c7":"de", u"\u30c7\u30a3":"di",
        u"\u30c8":"to", u"\u30c9":"do",

        u"\u30ca":"na",
        u"\u30cb":"ni", u"\u30cb\u30e3":"nya",
        u"\u30cb\u30e5":"nyu", u"\u30cb\u30e7":"nyo",
        u"\u30cc":"nu", u"\u30cd":"ne", u"\u30ce":"no",

        u"\u30cf":"ha", u"\u30d0":"ba", u"\u30d1":"pa",
        u"\u30d2":"hi", u"\u30d2\u30e3":"hya",
        u"\u30d2\u30e5":"hyu", u"\u30d2\u30e7":"hyo",
        u"\u30d3":"bi", u"\u30d3\u30e3":"bya",
        u"\u30d3\u30e5":"byu", u"\u30d3\u30e7":"byo",
        u"\u30d4":"pi", u"\u30d4\u30e3":"pya",
        u"\u30d4\u30e5":"pyu", u"\u30d4\u30e7":"pyo",
        u"\u30d5":"fu", u"\u30d5\u30a1":"fa",
        u"\u30d5\u30a3":"fi", u"\u30d5\u30a7":"fe",
        u"\u30d5\u30a9":"fo",
        u"\u30d6":"bu", u"\u30d7":"pu",
        u"\u30d8":"he", u"\u30d9":"be", u"\u30da":"pe",
        u"\u30db":"ho", u"\u30dc":"bo", u"\u30dd":"po",

        u"\u30de":"ma",
        u"\u30df":"mi", u"\u30df\u30e3":"mya",
        u"\u30df\u30e5":"myu", u"\u30df\u30e7":"myo",
        u"\u30e0":"mu", u"\u30e1":"me", u"\u30e2":"mo",

        u"\u30e3":"ya", u"\u30e4":"ya",
        u"\u30e5":"yu", u"\u30e6":"yu",
        u"\u30e7":"yo", u"\u30e8":"yo",

        u"\u30e9":"ra",
        u"\u30ea":"ri", u"\u30ea\u30e3":"rya",
        u"\u30ea\u30e5":"ryu", u"\u30ea\u30e7":"ryo",
        u"\u30eb":"ru", u"\u30ec":"re", u"\u30ed":"ro",

        u"\u30ee":"wa", u"\u30ef":"wa",
        u"\u30f0":"i", u"\u30f1":"e",
        u"\u30f2":"wo", u"\u30f3":"n",

        u"\u30f3\u30a2":"n'a", u"\u30f3\u30a4":"n'i",
        u"\u30f3\u30a6":"n'u", u"\u30f3\u30a8":"n'e",
        u"\u30f3\u30aa":"n'o",

        u"\u30f4":"vu", u"\u30f5":"ka",
        u"\u30f6":"ke",
        }

    # Longest key in the table; convert() never needs a longer prefix.
    _MAX_KEY_LEN = max(map(len, K2a_table))

    def isKatakana(self, char):
        """Return True if *char* lies in U+30A1..U+30F6."""
        return 0x30a0 < ord(char) < 0x30f7

    def convert(self, text):
        """Romanize the longest table entry that prefixes *text*.

        Returns ``(romaji, consumed)`` where *consumed* is the number of
        characters of *text* the match covers, or ``("", -1)`` when no
        prefix of *text* is in the table (including empty *text*).
        """
        Hstr = ""
        max_len = -1
        # Try every prefix length up to the longest key (previously capped at
        # 3, which made the 4-character "vva"-style entries unreachable).
        for x in range(1, min(self._MAX_KEY_LEN, len(text)) + 1):
            prefix = text[:x]
            if prefix in self.K2a_table:
                max_len = x
                Hstr = self.K2a_table[prefix]
        return (Hstr, max_len)
b/src/calibre/ebooks/unihandecode/pykakasi/kakasi.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +# kakasi.py +# +# Copyright 2011 Hiroshi Miura +# +# Original Copyright: +# * KAKASI (Kanji Kana Simple inversion program) +# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $ +# * Copyright (C) 1992 +# * Hironobu Takahashi (takahasi@tiny.or.jp) +# * +# * This program is free software; you can redistribute it and/or modify +# * it under the terms of the GNU General Public License as published by +# * the Free Software Foundation; either versions 2, or (at your option) +# * any later version. +# * +# * This program is distributed in the hope that it will be useful +# * but WITHOUT ANY WARRANTY; without even the implied warranty of +# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# * GNU General Public License for more details. +# * +# * You should have received a copy of the GNU General Public License +# * along with KAKASI, see the file COPYING. If not, write to the Free +# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA +# * 02111-1307, USA. 
class kakasi(object):
    """Pure-Python kanji/kana to romaji converter built from J2H, H2a and K2a."""

    # Converter instances; set per instance in __init__.
    j2h = None
    h2a = None
    k2a = None

    def __init__(self, mode="-J2a -H2a -K2a"):
        # `mode` is accepted but ignored: mode selection is not implemented.
        self.j2h = J2H()
        self.h2a = H2a()
        self.k2a = K2a()

    def _romanize_yomi(self, yomi):
        """Romanize a hiragana reading; stops at the first unconvertible character."""
        pieces = []
        m = 0
        while m < len(yomi):
            (s, n) = self.h2a.convert(yomi[m:])
            if n <= 0:
                break
            m += n
            pieces.append(s)
        return ''.join(pieces)

    def _romanize_kana_run(self, text, i, convert, in_script):
        """Romanize the kana run starting at ``text[i]``.

        Returns ``(romaji, next_index)``. A single space is appended when the
        run is followed by more text.
        """
        pieces = []
        while True:
            (t, l) = convert(text[i:])
            if l <= 0:
                # No table entry: keep the character and step over it so the
                # index always advances.
                pieces.append(text[i])
                l = 1
            else:
                pieces.append(t)
            i += l
            if i >= len(text):
                break
            if not in_script(text[i]):
                pieces.append(' ')
                break
        return (''.join(pieces), i)

    def do(self, text):
        """Return *text* with kanji, hiragana and katakana romanized.

        Kanji words are converted through their dictionary reading and
        capitalized; kana runs are converted directly. Each converted word or
        run is followed by a space unless it ends the input. Characters that
        are none of the three scripts, and kanji without a dictionary entry,
        are copied to the output unchanged.
        """
        out = []
        i = 0
        while i < len(text):
            c = text[i]
            if self.j2h.isKanji(c):
                (t, l) = self.j2h.convert(text[i:])
                if l <= 0:
                    # Unknown kanji: pass it through and keep going instead of
                    # abandoning the rest of the input.
                    out.append(c)
                    i += 1
                    continue
                i += l
                out.append(self._romanize_yomi(t).capitalize())
                if i < len(text):
                    out.append(' ')
            elif self.h2a.isHiragana(c):
                (s, i) = self._romanize_kana_run(text, i, self.h2a.convert, self.h2a.isHiragana)
                out.append(s)
            elif self.k2a.isKatakana(c):
                (s, i) = self._romanize_kana_run(text, i, self.k2a.convert, self.k2a.isKatakana)
                out.append(s)
            else:
                out.append(c)
                i += 1
        return ''.join(out)
'[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', ], 'x21': [ - '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', + '', '', '', '', '', '', '', '', '', '', 'g', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '[?]', '[?]', '[?]', '[?]', '[?]',