unihandecode: import recent development version

It no longer depends on an external library.
This commit is contained in:
Hiroshi Miura 2011-01-28 13:01:10 +09:00
parent cf646b98e4
commit 868d43af68
10 changed files with 596 additions and 51 deletions

View File

@ -17,70 +17,26 @@ This functionality is owned by Kakasi Japanese processing engine.
Copyright (c) 2010 Hiroshi Miura Copyright (c) 2010 Hiroshi Miura
''' '''
from ctypes import *
import os, re import os, re
from unihandecode.unidecoder import Unidecoder from unihandecode.unidecoder import Unidecoder
from unihandecode.unicodepoints import CODEPOINTS from unihandecode.unicodepoints import CODEPOINTS
from unihandecode.jacodepoints import CODEPOINTS as JACODES from unihandecode.jacodepoints import CODEPOINTS as JACODES
from unihandecode.pykakasi import kakasi
class Jadecoder(Unidecoder): class Jadecoder(Unidecoder):
#kakasi instance
kakasi = None kakasi = None
codepoints = {} codepoints = {}
def __init__(self): def __init__(self):
self.codepoints = CODEPOINTS self.codepoints = CODEPOINTS
self.codepoints.update(JACODES) self.codepoints.update(JACODES)
self.kakasi = kakasi()
try:
kakasi_location = os.environ['KAKASILIB']
# May be "C:\\kakasi\\lib\\" in WIndows
# "/opt/local/lib/" in Mac OS X
kakasi_location = re.sub(r'/$', '', kakasi_location)
except KeyError:
if os.name is "nt":
kakasi_location = "c:\\kakasi\\lib\\kakasi"
elif os.name is "Darwin":
kakasi_location = 'opt/local/lib'
else:
kakasi_location = ''
if os.name is "nt":
kakasi_libname = "kakasi"
elif os.name is "Darwin":
kakasi_libname = "libkakasi.dylib"
elif os.name is "posix":
kakasi_libname = "libkakasi.so.2"
else:
self.kakasi = None
return
try:
self.kakasi = CDLL(os.path.join(kakasi_location, kakasi_libname))
except:
self.kakasi = None
def decode(self, text): def decode(self, text):
# if there is not kakasi library, we fall down to use unidecode
if self.kakasi is None:
return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),text)
numopt = 9
argArray = c_char_p * numopt
args = argArray( c_char_p("kakasi")
,c_char_p("-Ja"),c_char_p("-Ha"),c_char_p("-Ka"),c_char_p("-Ea")
,c_char_p("-ka"),c_char_p("-C"),c_char_p("-s")
,c_char_p("-ieuc")
)
self.kakasi.kakasi_getopt_argv(numopt, args)
kakasi_do = self.kakasi.kakasi_do
kakasi_do.restype = c_char_p
try: try:
cstr = c_char_p(text.encode("eucjp")) dummy = text.encode('euc-jp') # test if text contains only Japanese and ASCII characters.
return kakasi_do(cstr).decode("eucjp") result=self.kakasi.do(text)
return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),result)
except: except:
return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),text) return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),text)

View File

@ -0,0 +1,4 @@
from kakasi import kakasi
# Public API of this package: the only name imported above is ``kakasi``.
# Listing a name the module does not define makes ``from ... import *``
# raise AttributeError.
__all__ = ["kakasi"]

View File

@ -0,0 +1,179 @@
# -*- coding: utf-8 -*-
# h2a.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
from jisyo import jisyo
class H2a (object):
    """Longest-prefix converter from hiragana to ASCII romanization.

    ``H2a_table`` maps hiragana sequences (one to four code points) to
    their romanized form.  ``convert`` looks up the longest table key that
    is a prefix of the given text.
    """

    H2a_table = {
        u"\u3041":"a", u"\u3042":"a",
        u"\u3043":"i", u"\u3044":"i",
        u"\u3045":"u", u"\u3046":"u",
        u"\u3046\u309b":"vu", u"\u3046\u309b\u3041":"va",
        u"\u3046\u309b\u3043":"vi", u"\u3046\u309b\u3047":"ve",
        u"\u3046\u309b\u3049":"vo",
        u"\u3047":"e", u"\u3048":"e",
        u"\u3049":"o", u"\u304a":"o",
        u"\u304b":"ka", u"\u304c":"ga",
        # ki/gi + small ya/yu/yo (U+3083/U+3085/U+3087), matching the
        # geminated "kkya"/"ggya" rows further down.
        u"\u304d":"ki", u"\u304d\u3083":"kya",
        u"\u304d\u3085":"kyu", u"\u304d\u3087":"kyo",
        u"\u304e":"gi", u"\u304e\u3083":"gya",
        u"\u304e\u3085":"gyu", u"\u304e\u3087":"gyo",
        u"\u304f":"ku", u"\u3050":"gu",
        u"\u3051":"ke", u"\u3052":"ge",
        u"\u3053":"ko", u"\u3054":"go",
        u"\u3055":"sa", u"\u3056":"za",
        u"\u3057":"shi", u"\u3057\u3083":"sha",
        u"\u3057\u3085":"shu", u"\u3057\u3087":"sho",
        u"\u3058":"ji", u"\u3058\u3083":"ja",
        u"\u3058\u3085":"ju", u"\u3058\u3087":"jo",
        u"\u3059":"su", u"\u305a":"zu",
        u"\u305b":"se", u"\u305c":"ze",
        u"\u305d":"so", u"\u305e":"zo",
        u"\u305f":"ta", u"\u3060":"da",
        u"\u3061":"chi", u"\u3061\u3047":"che", u"\u3061\u3083":"cha",
        u"\u3061\u3085":"chu", u"\u3061\u3087":"cho",
        u"\u3062":"ji", u"\u3062\u3083":"ja",
        u"\u3062\u3085":"ju", u"\u3062\u3087":"jo",
        u"\u3063":"tsu",
        u"\u3063\u3046\u309b":"vvu",
        u"\u3063\u3046\u309b\u3041":"vva",
        u"\u3063\u3046\u309b\u3043":"vvi",
        u"\u3063\u3046\u309b\u3047":"vve",
        u"\u3063\u3046\u309b\u3049":"vvo",
        u"\u3063\u304b":"kka", u"\u3063\u304c":"gga",
        u"\u3063\u304d":"kki", u"\u3063\u304d\u3083":"kkya",
        u"\u3063\u304d\u3085":"kkyu", u"\u3063\u304d\u3087":"kkyo",
        u"\u3063\u304e":"ggi", u"\u3063\u304e\u3083":"ggya",
        u"\u3063\u304e\u3085":"ggyu", u"\u3063\u304e\u3087":"ggyo",
        u"\u3063\u304f":"kku", u"\u3063\u3050":"ggu",
        u"\u3063\u3051":"kke", u"\u3063\u3052":"gge",
        u"\u3063\u3053":"kko", u"\u3063\u3054":"ggo",
        u"\u3063\u3055":"ssa", u"\u3063\u3056":"zza",
        u"\u3063\u3057":"sshi", u"\u3063\u3057\u3083":"ssha",
        u"\u3063\u3057\u3085":"sshu", u"\u3063\u3057\u3087":"ssho",
        u"\u3063\u3058":"jji", u"\u3063\u3058\u3083":"jja",
        u"\u3063\u3058\u3085":"jju", u"\u3063\u3058\u3087":"jjo",
        u"\u3063\u3059":"ssu", u"\u3063\u305a":"zzu",
        # "zze" is small tsu + ze (U+305C); it used to share the key of
        # "zzo" (U+305E) and was silently overwritten.
        u"\u3063\u305b":"sse", u"\u3063\u305c":"zze",
        u"\u3063\u305d":"sso", u"\u3063\u305e":"zzo",
        u"\u3063\u305f":"tta", u"\u3063\u3060":"dda",
        u"\u3063\u3061":"tchi", u"\u3063\u3061\u3083":"tcha",
        u"\u3063\u3061\u3085":"tchu", u"\u3063\u3061\u3087":"tcho",
        u"\u3063\u3062":"jji", u"\u3063\u3062\u3083":"jjya",
        u"\u3063\u3062\u3085":"jjyu", u"\u3063\u3062\u3087":"jjyo",
        u"\u3063\u3064":"ttsu", u"\u3063\u3065":"zzu",
        u"\u3063\u3066":"tte", u"\u3063\u3067":"dde",
        u"\u3063\u3068":"tto", u"\u3063\u3069":"ddo",
        u"\u3063\u306f":"hha", u"\u3063\u3070":"bba",
        u"\u3063\u3071":"ppa",
        u"\u3063\u3072":"hhi", u"\u3063\u3072\u3083":"hhya",
        u"\u3063\u3072\u3085":"hhyu", u"\u3063\u3072\u3087":"hhyo",
        u"\u3063\u3073":"bbi", u"\u3063\u3073\u3083":"bbya",
        u"\u3063\u3073\u3085":"bbyu", u"\u3063\u3073\u3087":"bbyo",
        u"\u3063\u3074":"ppi", u"\u3063\u3074\u3083":"ppya",
        u"\u3063\u3074\u3085":"ppyu", u"\u3063\u3074\u3087":"ppyo",
        u"\u3063\u3075":"ffu", u"\u3063\u3075\u3041":"ffa",
        u"\u3063\u3075\u3043":"ffi", u"\u3063\u3075\u3047":"ffe",
        u"\u3063\u3075\u3049":"ffo",
        u"\u3063\u3076":"bbu", u"\u3063\u3077":"ppu",
        u"\u3063\u3078":"hhe", u"\u3063\u3079":"bbe",
        u"\u3063\u307a":"ppe",
        u"\u3063\u307b":"hho", u"\u3063\u307c":"bbo",
        u"\u3063\u307d":"ppo",
        u"\u3063\u3084":"yya", u"\u3063\u3086":"yyu",
        u"\u3063\u3088":"yyo",
        u"\u3063\u3089":"rra", u"\u3063\u308a":"rri",
        u"\u3063\u308a\u3083":"rrya", u"\u3063\u308a\u3085":"rryu",
        u"\u3063\u308a\u3087":"rryo",
        u"\u3063\u308b":"rru", u"\u3063\u308c":"rre",
        u"\u3063\u308d":"rro",
        u"\u3064":"tsu", u"\u3065":"zu",
        u"\u3066":"te", u"\u3067":"de", u"\u3067\u3043":"di",
        u"\u3068":"to", u"\u3069":"do",
        u"\u306a":"na",
        u"\u306b":"ni", u"\u306b\u3083":"nya",
        u"\u306b\u3085":"nyu", u"\u306b\u3087":"nyo",
        u"\u306c":"nu", u"\u306d":"ne", u"\u306e":"no",
        u"\u306f":"ha", u"\u3070":"ba", u"\u3071":"pa",
        u"\u3072":"hi", u"\u3072\u3083":"hya",
        u"\u3072\u3085":"hyu", u"\u3072\u3087":"hyo",
        u"\u3073":"bi", u"\u3073\u3083":"bya",
        u"\u3073\u3085":"byu", u"\u3073\u3087":"byo",
        u"\u3074":"pi", u"\u3074\u3083":"pya",
        u"\u3074\u3085":"pyu", u"\u3074\u3087":"pyo",
        u"\u3075":"fu", u"\u3075\u3041":"fa",
        u"\u3075\u3043":"fi", u"\u3075\u3047":"fe",
        u"\u3075\u3049":"fo",
        u"\u3076":"bu", u"\u3077":"pu",
        u"\u3078":"he", u"\u3079":"be", u"\u307a":"pe",
        u"\u307b":"ho", u"\u307c":"bo", u"\u307d":"po",
        u"\u307e":"ma",
        u"\u307f":"mi", u"\u307f\u3083":"mya",
        u"\u307f\u3085":"myu", u"\u307f\u3087":"myo",
        u"\u3080":"mu", u"\u3081":"me", u"\u3082":"mo",
        u"\u3083":"ya", u"\u3084":"ya",
        u"\u3085":"yu", u"\u3086":"yu",
        u"\u3087":"yo", u"\u3088":"yo",
        u"\u3089":"ra",
        u"\u308a":"ri", u"\u308a\u3083":"rya",
        u"\u308a\u3085":"ryu", u"\u308a\u3087":"ryo",
        u"\u308b":"ru", u"\u308c":"re", u"\u308d":"ro",
        u"\u308e":"wa", u"\u308f":"wa",
        u"\u3090":"i", u"\u3091":"e",
        u"\u3092":"wo", u"\u3093":"n",
        u"\u3093\u3042":"n'a", u"\u3093\u3044":"n'i",
        u"\u3093\u3046":"n'u", u"\u3093\u3048":"n'e",
        u"\u3093\u304a":"n'o",
    }

    # Length of the longest key, so convert() also probes the four-character
    # entries (e.g. small tsu + u + dakuten + small a -> "vva").
    _MAX_KEY_LEN = max(map(len, H2a_table))

    def isHiragana(self, char):
        """Return True if *char* lies in U+3041..U+3093 (exclusive bounds
        0x3040 and 0x3094)."""
        return 0x3040 < ord(char) < 0x3094

    def convert(self, text):
        """Romanize the longest table entry that prefixes *text*.

        Returns a tuple ``(romaji, consumed)`` where ``consumed`` is the
        number of characters of *text* that were matched.  When no prefix
        of *text* is in the table (including empty *text*) the result is
        ``("", -1)``.
        """
        # Try the longest candidate first so the first hit is the best one.
        for size in range(min(self._MAX_KEY_LEN, len(text)), 0, -1):
            prefix = text[:size]
            if prefix in self.H2a_table:
                return (self.H2a_table[prefix], size)
        return ("", -1)

View File

@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-
# j2h.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original Copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
from jisyo import jisyo
import re
class J2H (object):
    """Kanji-to-hiragana converter backed by the ``jisyo`` dictionary.

    ``convert`` looks up the dictionary table for the first character of
    the text and returns the reading of the longest matching entry.
    """

    kanwa = None

    # One entry per code point U+3040..U+309F (96 entries): the romaji
    # letters that a dictionary "tail" may carry for the kana at that
    # position.  Index = ord(kana) - 0x3040.
    cl_table = [
        "","aiueow", "aiueow", "aiueow", "aiueow", "aiueow", "aiueow", "aiueow",
        "aiueow", "aiueow", "aiueow", "k", "g", "k", "g", "k", "g", "k", "g", "k",
        "g", "s", "zj", "s", "zj", "s", "zj", "s", "zj", "s", "zj", "t", "d", "tc",
        "d", "aiueokstchgzjfdbpw", "t", "d", "t", "d", "t", "d", "n", "n", "n", "n",
        "n", "h", "b", "p", "h", "b", "p", "hf", "b", "p", "h", "b", "p", "h", "b",
        "p", "m", "m", "m", "m", "m", "y", "y", "y", "y", "y", "y", "rl", "rl",
        "rl", "rl", "rl", "wiueo", "wiueo", "wiueo", "wiueo", "w", "n", "v", "k",
        "k", "", "", "", "", "", "", "", "", ""]

    def __init__(self):
        self.kanwa = jisyo()

    def isKanji(self, c):
        """Return True if *c* lies in the range U+3400..U+FA2D."""
        return 0x3400 <= ord(c) < 0xfa2e

    def isCletter(self, l, c):
        """Return True if letter *l* is listed in ``cl_table`` for kana *c*.

        *c* must lie in U+3041..U+309F; anything else yields False.
        """
        code = ord(c)
        if not (0x3041 <= code <= 0x309f):
            return False
        # The table is laid out from U+3040 (small tsu, U+3063, is entry 35,
        # the "any consonant" row), so the offset is 0x3040.
        return l in self.cl_table[code - 0x3040]

    def itaiji_conv(self, text):
        """Replace every character of *text* that is a key of
        ``kanwa.itaijidict`` with the value stored for it."""
        itaiji = self.kanwa.itaijidict
        # Collect first, then substitute, so the scan is over the
        # original text.
        for c in [ch for ch in text if ch in itaiji]:
            # Plain replacement: the keys are literal characters, not
            # regular expressions.
            text = text.replace(c, itaiji[c])
        return text

    def convert(self, text):
        """Return ``(reading, consumed)`` for the longest dictionary match.

        The table for ``text[0]`` maps a word to a list of
        ``(yomi, tail)`` pairs.  An entry with an empty tail matches on the
        word alone; an entry with a tail additionally requires that the
        character following the word passes ``isCletter(tail, ...)``, and
        that character is appended to the reading and counted as consumed.
        Returns ``("", 0)`` when *text* is empty, when there is no table for
        its first character, or when nothing matches.
        """
        if not text:
            return ("", 0)
        table = self.kanwa.load_jisyo(text[0])
        if table is None:
            return ("", 0)
        Hstr = ""
        max_len = 0
        for (k, v) in table.items():
            if not text.startswith(k):
                continue
            length = len(k)
            for (yomi, tail) in v:
                if tail == '':
                    if max_len < length:
                        Hstr = yomi
                        max_len = length
                elif (max_len < length + 1 and len(text) > length
                        and self.isCletter(tail, text[length])):
                    Hstr = ''.join([yomi, text[length]])
                    max_len = length + 1
        return (Hstr, max_len)

View File

@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
# jisyo.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
from cPickle import load
import anydbm,marshal
from zlib import decompress
import os
class jisyo (object):
    """Access to the kanji dictionary files used by the converters.

    ``kanwadict`` is a dbm database whose values are zlib-compressed,
    marshalled tables; ``itaijidict`` is loaded from a pickle file.
    Decoded tables are memoized in ``jisyo_table``.
    """

    kanwadict = None
    itaijidict = None
    # Cache of decoded tables, keyed by the four-digit hex code point.
    # Defined on the class, so it is shared by all instances.
    jisyo_table = {}

    def __init__(self):
        # NOTE(review): both paths are relative to the current working
        # directory -- confirm that callers always run from the directory
        # containing ``unihandecode``.
        if self.kanwadict is None:
            dictpath = os.path.join('unihandecode','pykakasi','kanwadict2.db')
            self.kanwadict = anydbm.open(dictpath,'r')
        if self.itaijidict is None:
            itaijipath = os.path.join('unihandecode','pykakasi','itaijidict2.pickle')
            # Unpickling executes whatever the file describes; this file
            # must only ever be a trusted, bundled resource.
            with open(itaijipath, 'rb') as itaiji_pkl:
                self.itaijidict = load(itaiji_pkl)

    def load_jisyo(self, char):
        """Return the dictionary table for *char*, or None if unavailable.

        The table is looked up under the key ``"%04x" % ord(char)``,
        decompressed and unmarshalled on first use, then served from
        ``jisyo_table`` afterwards.  None is returned when the key is
        missing or the stored value cannot be decoded.
        """
        key = "%04x" % ord(char)
        if key in self.jisyo_table:
            return self.jisyo_table[key]
        try:
            table = marshal.loads(decompress(self.kanwadict[key]))
        except Exception:
            # Best effort: a missing or unreadable entry means "no table".
            return None
        self.jisyo_table[key] = table
        return table

View File

@ -0,0 +1,182 @@
# -*- coding: utf-8 -*-
# k2a.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
from jisyo import jisyo
class K2a (object):
    """Longest-prefix converter from katakana to ASCII romanization.

    ``K2a_table`` maps katakana sequences (one to four code points) to
    their romanized form.  ``convert`` looks up the longest table key that
    is a prefix of the given text.
    """

    K2a_table = {
        u"\u30a1":"a", u"\u30a2":"a",
        u"\u30a3":"i", u"\u30a4":"i",
        u"\u30a5":"u", u"\u30a6":"u",
        u"\u30a6\u309b":"vu", u"\u30a6\u309b\u30a1":"va",
        u"\u30a6\u309b\u30a3":"vi", u"\u30a6\u309b\u30a7":"ve",
        u"\u30a6\u309b\u30a9":"vo",
        u"\u30a7":"e", u"\u30a8":"e",
        u"\u30a9":"o", u"\u30aa":"o",
        u"\u30ab":"ka", u"\u30ac":"ga",
        # ki/gi + small ya/yu/yo (U+30E3/U+30E5/U+30E7), matching the
        # geminated "kkya"/"ggya" rows further down.
        u"\u30ad":"ki", u"\u30ad\u30e3":"kya",
        u"\u30ad\u30e5":"kyu", u"\u30ad\u30e7":"kyo",
        u"\u30ae":"gi", u"\u30ae\u30e3":"gya",
        u"\u30ae\u30e5":"gyu", u"\u30ae\u30e7":"gyo",
        u"\u30af":"ku", u"\u30b0":"gu",
        u"\u30b1":"ke", u"\u30b2":"ge",
        u"\u30b3":"ko", u"\u30b4":"go",
        u"\u30b5":"sa", u"\u30b6":"za",
        u"\u30b7":"shi", u"\u30b7\u30e3":"sha",
        u"\u30b7\u30e5":"shu", u"\u30b7\u30e7":"sho",
        u"\u30b8":"ji", u"\u30b8\u30e3":"ja",
        u"\u30b8\u30e5":"ju", u"\u30b8\u30e7":"jo",
        u"\u30b9":"su", u"\u30ba":"zu",
        u"\u30bb":"se", u"\u30bc":"ze",
        u"\u30bd":"so", u"\u30be":"zo",
        u"\u30bf":"ta", u"\u30c0":"da",
        u"\u30c1":"chi", u"\u30c1\u30a7":"che", u"\u30c1\u30e3":"cha",
        u"\u30c1\u30e5":"chu", u"\u30c1\u30e7":"cho",
        u"\u30c2":"ji", u"\u30c2\u30e3":"ja",
        u"\u30c2\u30e5":"ju", u"\u30c2\u30e7":"jo",
        u"\u30c3":"tsu",
        u"\u30c3\u30a6\u309b":"vvu",
        u"\u30c3\u30a6\u309b\u30a1":"vva",
        u"\u30c3\u30a6\u309b\u30a3":"vvi",
        u"\u30c3\u30a6\u309b\u30a7":"vve",
        u"\u30c3\u30a6\u309b\u30a9":"vvo",
        u"\u30c3\u30ab":"kka", u"\u30c3\u30ac":"gga",
        u"\u30c3\u30ad":"kki", u"\u30c3\u30ad\u30e3":"kkya",
        u"\u30c3\u30ad\u30e5":"kkyu", u"\u30c3\u30ad\u30e7":"kkyo",
        u"\u30c3\u30ae":"ggi", u"\u30c3\u30ae\u30e3":"ggya",
        u"\u30c3\u30ae\u30e5":"ggyu", u"\u30c3\u30ae\u30e7":"ggyo",
        u"\u30c3\u30af":"kku", u"\u30c3\u30b0":"ggu",
        u"\u30c3\u30b1":"kke", u"\u30c3\u30b2":"gge",
        u"\u30c3\u30b3":"kko", u"\u30c3\u30b4":"ggo",
        u"\u30c3\u30b5":"ssa", u"\u30c3\u30b6":"zza",
        u"\u30c3\u30b7":"sshi", u"\u30c3\u30b7\u30e3":"ssha",
        u"\u30c3\u30b7\u30e5":"sshu", u"\u30c3\u30b7\u30e7":"ssho",
        u"\u30c3\u30b8":"jji", u"\u30c3\u30b8\u30e3":"jja",
        u"\u30c3\u30b8\u30e5":"jju", u"\u30c3\u30b8\u30e7":"jjo",
        u"\u30c3\u30b9":"ssu", u"\u30c3\u30ba":"zzu",
        # "zze" is small tsu + ze (U+30BC); it used to share the key of
        # "zzo" (U+30BE) and was silently overwritten.
        u"\u30c3\u30bb":"sse", u"\u30c3\u30bc":"zze",
        u"\u30c3\u30bd":"sso", u"\u30c3\u30be":"zzo",
        u"\u30c3\u30bf":"tta", u"\u30c3\u30c0":"dda",
        u"\u30c3\u30c1":"tchi", u"\u30c3\u30c1\u30e3":"tcha",
        u"\u30c3\u30c1\u30e5":"tchu", u"\u30c3\u30c1\u30e7":"tcho",
        u"\u30c3\u30c2":"jji", u"\u30c3\u30c2\u30e3":"jjya",
        u"\u30c3\u30c2\u30e5":"jjyu", u"\u30c3\u30c2\u30e7":"jjyo",
        u"\u30c3\u30c4":"ttsu", u"\u30c3\u30c5":"zzu",
        u"\u30c3\u30c6":"tte", u"\u30c3\u30c7":"dde",
        u"\u30c3\u30c8":"tto", u"\u30c3\u30c9":"ddo",
        u"\u30c3\u30cf":"hha", u"\u30c3\u30d0":"bba",
        u"\u30c3\u30d1":"ppa",
        u"\u30c3\u30d2":"hhi", u"\u30c3\u30d2\u30e3":"hhya",
        u"\u30c3\u30d2\u30e5":"hhyu", u"\u30c3\u30d2\u30e7":"hhyo",
        u"\u30c3\u30d3":"bbi", u"\u30c3\u30d3\u30e3":"bbya",
        u"\u30c3\u30d3\u30e5":"bbyu", u"\u30c3\u30d3\u30e7":"bbyo",
        u"\u30c3\u30d4":"ppi", u"\u30c3\u30d4\u30e3":"ppya",
        u"\u30c3\u30d4\u30e5":"ppyu", u"\u30c3\u30d4\u30e7":"ppyo",
        u"\u30c3\u30d5":"ffu", u"\u30c3\u30d5\u30a1":"ffa",
        u"\u30c3\u30d5\u30a3":"ffi", u"\u30c3\u30d5\u30a7":"ffe",
        u"\u30c3\u30d5\u30a9":"ffo",
        u"\u30c3\u30d6":"bbu", u"\u30c3\u30d7":"ppu",
        u"\u30c3\u30d8":"hhe", u"\u30c3\u30d9":"bbe",
        u"\u30c3\u30da":"ppe",
        u"\u30c3\u30db":"hho", u"\u30c3\u30dc":"bbo",
        u"\u30c3\u30dd":"ppo",
        u"\u30c3\u30e4":"yya", u"\u30c3\u30e6":"yyu",
        u"\u30c3\u30e8":"yyo",
        u"\u30c3\u30e9":"rra", u"\u30c3\u30ea":"rri",
        u"\u30c3\u30ea\u30e3":"rrya", u"\u30c3\u30ea\u30e5":"rryu",
        u"\u30c3\u30ea\u30e7":"rryo",
        u"\u30c3\u30eb":"rru", u"\u30c3\u30ec":"rre",
        u"\u30c3\u30ed":"rro",
        u"\u30c4":"tsu", u"\u30c5":"zu",
        u"\u30c6":"te", u"\u30c7":"de", u"\u30c7\u30a3":"di",
        u"\u30c8":"to", u"\u30c9":"do",
        u"\u30ca":"na",
        u"\u30cb":"ni", u"\u30cb\u30e3":"nya",
        u"\u30cb\u30e5":"nyu", u"\u30cb\u30e7":"nyo",
        u"\u30cc":"nu", u"\u30cd":"ne", u"\u30ce":"no",
        u"\u30cf":"ha", u"\u30d0":"ba", u"\u30d1":"pa",
        u"\u30d2":"hi", u"\u30d2\u30e3":"hya",
        u"\u30d2\u30e5":"hyu", u"\u30d2\u30e7":"hyo",
        u"\u30d3":"bi", u"\u30d3\u30e3":"bya",
        u"\u30d3\u30e5":"byu", u"\u30d3\u30e7":"byo",
        u"\u30d4":"pi", u"\u30d4\u30e3":"pya",
        u"\u30d4\u30e5":"pyu", u"\u30d4\u30e7":"pyo",
        u"\u30d5":"fu", u"\u30d5\u30a1":"fa",
        u"\u30d5\u30a3":"fi", u"\u30d5\u30a7":"fe",
        u"\u30d5\u30a9":"fo",
        u"\u30d6":"bu", u"\u30d7":"pu",
        u"\u30d8":"he", u"\u30d9":"be", u"\u30da":"pe",
        u"\u30db":"ho", u"\u30dc":"bo", u"\u30dd":"po",
        u"\u30de":"ma",
        u"\u30df":"mi", u"\u30df\u30e3":"mya",
        u"\u30df\u30e5":"myu", u"\u30df\u30e7":"myo",
        u"\u30e0":"mu", u"\u30e1":"me", u"\u30e2":"mo",
        u"\u30e3":"ya", u"\u30e4":"ya",
        u"\u30e5":"yu", u"\u30e6":"yu",
        u"\u30e7":"yo", u"\u30e8":"yo",
        u"\u30e9":"ra",
        u"\u30ea":"ri", u"\u30ea\u30e3":"rya",
        u"\u30ea\u30e5":"ryu", u"\u30ea\u30e7":"ryo",
        u"\u30eb":"ru", u"\u30ec":"re", u"\u30ed":"ro",
        u"\u30ee":"wa", u"\u30ef":"wa",
        u"\u30f0":"i", u"\u30f1":"e",
        u"\u30f2":"wo", u"\u30f3":"n",
        u"\u30f3\u30a2":"n'a", u"\u30f3\u30a4":"n'i",
        u"\u30f3\u30a6":"n'u", u"\u30f3\u30a8":"n'e",
        u"\u30f3\u30aa":"n'o",
        u"\u30f4":"vu", u"\u30f5":"ka",
        u"\u30f6":"ke",
    }

    # Length of the longest key, so convert() also probes the four-character
    # entries (e.g. small tsu + u + dakuten + small a -> "vva").
    _MAX_KEY_LEN = max(map(len, K2a_table))

    def isKatakana(self, char):
        """Return True if *char* lies in U+30A1..U+30F6 (exclusive bounds
        0x30a0 and 0x30f7)."""
        return 0x30a0 < ord(char) < 0x30f7

    def convert(self, text):
        """Romanize the longest table entry that prefixes *text*.

        Returns a tuple ``(romaji, consumed)`` where ``consumed`` is the
        number of characters of *text* that were matched.  When no prefix
        of *text* is in the table (including empty *text*) the result is
        ``("", -1)``.
        """
        # Try the longest candidate first so the first hit is the best one.
        for size in range(min(self._MAX_KEY_LEN, len(text)), 0, -1):
            prefix = text[:size]
            if prefix in self.K2a_table:
                return (self.K2a_table[prefix], size)
        return ("", -1)

View File

@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-
# kakasi.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original Copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
import re
import sys, os
from j2h import J2H
from h2a import H2a
from k2a import K2a
class kakasi(object):
    """Romanizer that combines the kanji, hiragana and katakana converters.

    ``do`` walks the text once and dispatches each run of characters to
    ``J2H`` (then ``H2a``), ``H2a`` or ``K2a``; all other characters are
    copied through unchanged.
    """

    j2h = None
    h2a = None
    k2a = None

    def __init__(self, mode="-J2a -H2a -K2a"):
        # Mode selection is not supported yet; *mode* is accepted and ignored.
        self.j2h = J2H()
        self.h2a = H2a()
        self.k2a = K2a()

    def _romanize_run(self, text, start, converter, in_script):
        """Romanize the run of same-script kana beginning at *start*.

        Returns ``(romaji, next_index)``.  A single space is appended when
        the run is followed by a character of a different kind.
        """
        pieces = []
        pos = start
        while True:
            (roman, used) = converter(text[pos:])
            if used <= 0:
                # No table entry: copy the character instead of letting a
                # non-positive length move the index backwards.
                roman, used = text[pos], 1
            pieces.append(roman)
            pos += used
            if pos >= len(text):
                break
            if not in_script(text[pos]):
                pieces.append(' ')
                break
        return (''.join(pieces), pos)

    def do(self, text):
        """Return *text* with kanji, hiragana and katakana romanized.

        A kanji word is replaced by its capitalized romanized reading,
        followed by a space unless it ends the text.  A kanji for which the
        dictionary has no match is copied through unchanged and processing
        continues with the next character.
        """
        out = []
        i = 0
        total = len(text)
        while i < total:
            ch = text[i]
            if self.j2h.isKanji(ch):
                (yomi, used) = self.j2h.convert(text[i:])
                if used <= 0:
                    # Unknown kanji: keep it and go on, rather than
                    # dropping the remainder of the input.
                    out.append(ch)
                    i += 1
                    continue
                i += used
                # Romanize the hiragana reading piece by piece; stop at the
                # first piece the hiragana table cannot convert.
                reading = []
                m = 0
                while m < len(yomi):
                    (s, n) = self.h2a.convert(yomi[m:])
                    if n <= 0:
                        break
                    reading.append(s)
                    m += n
                out.append(''.join(reading).capitalize())
                if i < total:
                    out.append(' ')
            elif self.h2a.isHiragana(ch):
                (roman, i) = self._romanize_run(
                    text, i, self.h2a.convert, self.h2a.isHiragana)
                out.append(roman)
            elif self.k2a.isKatakana(ch):
                (roman, i) = self._romanize_run(
                    text, i, self.k2a.convert, self.k2a.isKatakana)
                out.append(roman)
            else:
                out.append(ch)
                i += 1
        return ''.join(out)

Binary file not shown.

View File

@ -1040,7 +1040,7 @@ CODEPOINTS = {
'[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]', '[?]',
], ],
'x21': [ 'x21': [
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'g', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '[?]', '[?]', '[?]', '[?]', '[?]', '', '', '', '', '', '', '', '', '', '', '', '[?]', '[?]', '[?]', '[?]', '[?]',