From 59154c422cad716b41ab1b1c8a9b35ab63233b2e Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Sun, 30 Jan 2011 16:21:30 +0900 Subject: [PATCH] port recent unihandecode development branch now kana is converted with dictionary lookup unihandecode always normalize text in Unicode standard manner --- resources/kanadict2.pickle | Bin 0 -> 6809 bytes src/calibre/ebooks/unihandecode/__init__.py | 13 +- .../ebooks/unihandecode/pykakasi/jisyo.py | 5 + .../ebooks/unihandecode/pykakasi/k2a.py | 144 +----------------- 4 files changed, 15 insertions(+), 147 deletions(-) create mode 100644 resources/kanadict2.pickle diff --git a/resources/kanadict2.pickle b/resources/kanadict2.pickle new file mode 100644 index 0000000000000000000000000000000000000000..f44a36859d779c2c1a9da7e3a6774c6199578d4b GIT binary patch literal 6809 zcmZ8m1$Y}r6HS?!nVT{*Gc&r}m35ZRl69Jq&W>Gj?!wHRG>#%Gj>DWZX?|vAru&(h zX=Zj;>&y2wiT1sDv$Hg_x9#2gE!gXVmZOIa8QOMHCZ_Y9NW0(+@aG)Iq7}mpnMynDi%6gNBtvgGsy;j%hi7gk4^-HR$l* zSQF>6B-%!9;5cDBf*!wERB*iRHLA7jMH=6<8q)Gipc^R$I6cyhba@D8xGA2&lXa#{ zVm?o^b(Zn9QTqzuY-c+wi5qZ^*p|!P0i0{bF*b>6aGnUVKAdk*T826d7k3#7@&e;{ ztNdHZ`*5N2Oi|R~A{#{r7aJe%a*vmYqJuiTw5i}Jufb))w6`m`+zIM%g%N~srGsm5 zm4OvpEx(dzx5QjkaE)<1TXyPBxK=`GL(hZjntjF*_j$cg?d>#IH)uC;EdN$3A>5cU zZoo~}IDngt@mM`i9^8_$t;4O>HiX-Z?L@IH`Ea`dCTSpqJIo4i)d45b4!BdsTJg~L zDsYz(Ocg<1!QCy#$);bC#H*5MEgcfi!+R3Eo^b`lNHX5~Xk z$L+C}{V3$N0K*P=+(_OPNrw+l=)O1))Zs~C3*`DKV^`CCDR|o1^}#d7PQkOruA%J$ zcuucIw3Sahh5x*CUdb!;pvA3}Z0Z+er82y$UrbRAc*#-$yzF`&fLBb<0lb=0)ZjI% zQ1H4@tl?d+M3JLipV4qkX>=yE3Y?);$(k*|Cn1Taa}M()=yp|BC1^7v+$brh)btVU zf}Bv5N&tCn6umFgKx=9eRjgAEhR|+eqK(#of)?fjC_1Mi+69|(l zg-~{a8h{ZfK>3wKTXX;lDkjBKPT58Wlxs>=5^L7a`BhbfusMHRoEp$6^H(U)M0KU8 zI*hPX2;EI8-o)9eNBZsT^x=&r!~SaUrWGl8%f!4piPpi}qNu7IyrW@WkACPCxSO`Z zNK-SYrqF+5!?u6^L6-=VQn_y^jk|L@Ao~yWaSG7^$qmC>d~@k6v3a z0!Hg8igkAHQZUB#zZ}Mz{uPYVcF}TEakvmC@krXwFkYzEyayAE(F#+Dq!OAMI7iOi zN|-3!7yS$>?1o83u}W4br(m)HcIqYbV2T0OX^Rl1ngA&etbl34mWF#U-Pmo?b^*-L zKs-)>Y5+5h#Wn%jeVFAu^}}rADTFx=UJi2&?87|cuUF(*1@q0?%2d<&SRiZLq5Kjo zG`6GUU(Vf&jPVp%k)^O$4pXkgwSI~2P*)wDgQ(~%HDS<>8h}3Ov@?KZ&b9$M548ud z+@W>QnXo;CeiOws5oZ)M3@}T8vJWfFU2Jl~`;C$|rS6@av;ib-z%+AnVYk3#s#6~@@)cShzz2;TO-UU-BrMX2{%}(s zt%8pToTCHqQ6twUg6hM^46x3eJid7*@eJ`!_neO#TJQVL5I)iL8*hS73foEWr}P>| zRG;ZaIvzf)H(=;zJ@|}SjF|~t{^AKdqo0+`lJRnMVWm6W=Zu<`U#_E{mkH0A>*yDZ zexlsxX#;JCFUn$;ysip-$pj?VurEtMc^dy$%wkeHn=Gk`ivZV|ueKaaL!_iuqAPAb@{0 z1<^+Mx4?x$0RM6D2KcYQxts_8bMSf`M&Ck4;Nf1}&_#-tW*zQr$y_ePeQYC%cIhz$ zxNl0Z1^2TyJv|EdPm!DP083UX3J*+?oA4k@=5hfZY#UT8CAikSkB1CZ#N+W$Yn06f zcvy-Yhlg9TO1bTb6gd@-v}B&{l1JG_61}H~<>Apzz^4Mnde|A@SSSO{}UOd@Kvss0wq{y*&sv|u-Ek%yO(=A!8dU!^P z9F1pMl6Eehl_E#s+1!?hsOsW5DRv>AYo+;ofaj&i1$e$C%ViITr^xwufh98;4=+rS z^Y9`|=JOt2Y#Vg(l1Xg~@sgBaFk;NoGR4IYrLK zTP)c|W4|>;&cfR)SuXo{yKT@jkPNTG!#h%fm3XJMLGkk*n}wM|${(ZP5KzMnL!4M~4cct@s$Xc~e&C zWda{}ihfbhNqoX#I(bvKPcqhgz^AMudQA!N>68IqZtxjvKnGIcv)t16h4oTI zJk!#dyP)v7JzdoC`5`VsWSJI4b#W1ztjZTbe1Tbh`0DiVMgFay0rZnyFW=ANY4Z1Y ze2M8pI924sQr5$ak6C<~L7k<^UkzVr`t$MC6xWBZH93W^JC3qi4HdU}prvkw8E)u# zo5z(ho{Oyv@dD(%Vi~jC(yw&8I6C?)=a1Y$^q7z*#-P<#MQr1?ey!|HqJH`4qR%)Q z)p*P?rynf4O2t5BG4HIKPYp}-#TtF$SR5-$s zB&=a~Q|Mz4|JLhZoqhIk$L9TeZ>mTb>xnd~id@rvjJzlLvFHb-mvwr$X(rkLt) zf4Z;A_;NnJ!xqicBE#>sqP8{T#A?J8_$sGRN(|`oy4Q1Qz~f_n^?5NVv|yiD3Af1Y|Ywe&rIP)^SbjM zPPJr(p6sSMF}=0Z?j6MG3~?0nxmv^-)|v{w^hl+Cro+YvMwNJ$!zK$xi#Xe16Evo9 z4rBV`pAOB$B%(XM9GbaI@H2N<@>>JS0`r`DidF|WpE0|V{6W&d1q^YxIeN0fg?k#* zanTTi5Et)hP{Sod3>3c0Eu9$WNVinPmZn6U60pyTC~qrV=0pobMAwAco}xM~A0i5| zzNbM0`-d0=*x*)kh)|rRM?LpeH!prNWgCTP9dXfP+me>X+l;-mzk;3k_Vw z20VdO7Xn<}G>=z`3o5K@7}IMuRX!2*nH`5~nc(Hf`?-9P{#0?D)9drsrEtB&^zosW zgAESTPjCefH!`OC;cd+MQQoC+lQpJiDcsBroq75Ff9SI^ZeflC%F-KFD_x4aaVtae lzl+Rc7iIlk+{P$RZCg;Jy@uPlt@F)x88+PzcMuj{_J6B%aozv` literal 0 HcmV?d00001 diff --git a/src/calibre/ebooks/unihandecode/__init__.py b/src/calibre/ebooks/unihandecode/__init__.py index e864439629..72406245c6 100644 --- a/src/calibre/ebooks/unihandecode/__init__.py +++ b/src/calibre/ebooks/unihandecode/__init__.py @@ -15,6 +15,7 @@ Copyright(c) 2009, John Schember Tranliterate the string from unicode characters to ASCII in Chinese and others. ''' +import unicodedata class Unihandecoder(object): preferred_encoding = None @@ -48,12 +49,6 @@ class Unihandecoder(object): text = text.decode('utf-8', 'replace') except: # python3, str is unicode pass - return self.decoder.decode(text) - -def unidecode(text): - ''' - backword compatibility to unidecode - ''' - from calibre.ebooks.unihandecode.unidecoder import Unidecoder - decoder = Unihandecoder() - return decoder.decode(text) + #at first unicode normalize it. (see Unicode standards) + ntext = unicodedata.normalize('NFKC',text) + return self.decoder.decode(ntext) diff --git a/src/calibre/ebooks/unihandecode/pykakasi/jisyo.py b/src/calibre/ebooks/unihandecode/pykakasi/jisyo.py index 99dc36633d..df471f2b0a 100644 --- a/src/calibre/ebooks/unihandecode/pykakasi/jisyo.py +++ b/src/calibre/ebooks/unihandecode/pykakasi/jisyo.py @@ -12,6 +12,7 @@ import calibre.utils.resources as resources class jisyo (object): kanwadict = None itaijidict = None + kanadict = None jisyo_table = {} def __init__(self): @@ -22,6 +23,10 @@ class jisyo (object): itaijipath = resources.get_path('itaijidict2.pickle') itaiji_pkl = open(itaijipath, 'rb') self.itaijidict = load(itaiji_pkl) + if self.kanadict is None: + kanadictpath = resources.get_path('kanadict2.pickle') + kanadict_pkl = open(kanadictpath, 'rb') + self.kanadict = load(kanadict_pkl) def load_jisyo(self, char): try:#python2 diff --git a/src/calibre/ebooks/unihandecode/pykakasi/k2a.py b/src/calibre/ebooks/unihandecode/pykakasi/k2a.py index 409c67bc33..2a42fbbf09 100644 --- a/src/calibre/ebooks/unihandecode/pykakasi/k2a.py +++ b/src/calibre/ebooks/unihandecode/pykakasi/k2a.py @@ -29,142 +29,10 @@ from calibre.ebooks.unihandecode.pykakasi.jisyo import jisyo class K2a (object): - K2a_table = { - u"\u30a1":"a", u"\u30a2":"a", - u"\u30a3":"i", u"\u30a4":"i", - u"\u30a5":"u", u"\u30a6":"u", - u"\u30a6\u309b":"vu", u"\u30a6\u309b\u30a1":"va", - u"\u30a6\u309b\u30a3":"vi", u"\u30a6\u309b\u30a7":"ve", - u"\u30a6\u309b\u30a9":"vo", - u"\u30a7":"e", u"\u30a8":"e", - u"\u30a9":"o", u"\u30aa":"o", + kanwa = None - u"\u30ab":"ka", u"\u30ac":"ga", - u"\u30ad":"ki", u"\u30ad\u30a1":"kya", - u"\u30ad\u30a5":"kyu", u"\u30ad\u30a9":"kyo", - u"\u30ae":"gi", u"\u30b0\u30e3":"gya", - u"\u30ae\u30a5":"gyu", u"\u30ae\u30e7":"gyo", - u"\u30af":"ku", u"\u30b0":"gu", - u"\u30b1":"ke", u"\u30b2":"ge", - u"\u30b3":"ko", u"\u30b4":"go", - - u"\u30b5":"sa", u"\u30b6":"za", - u"\u30b7":"shi", u"\u30b7\u30e3":"sha", - u"\u30b7\u30e5":"shu", u"\u30b7\u30e7":"sho", - u"\u30b8":"ji", u"\u30b8\u30e3":"ja", - u"\u30b8\u30e5":"ju", u"\u30b8\u30e7":"jo", - u"\u30b9":"su", u"\u30ba":"zu", - u"\u30bb":"se", u"\u30bc":"ze", - u"\u30bd":"so", u"\u30be":"zo", - - u"\u30bf":"ta", u"\u30c0":"da", - u"\u30c1":"chi", u"\u30c1\u30a7":"che", u"\u30c1\u30e3":"cha", - u"\u30c1\u30e5":"chu", u"\u30c1\u30e7":"cho", - u"\u30c2":"ji", u"\u30c2\u30e3":"ja", - u"\u30c2\u30e5":"ju", u"\u30c2\u30e7":"jo", - - u"\u30c3":"tsu", - u"\u30c3\u30a6\u309b":"vvu", - u"\u30c3\u30a6\u309b\u30a1":"vva", - u"\u30c3\u30a6\u309b\u30a3":"vvi", - u"\u30c3\u30a6\u309b\u30a7":"vve", - u"\u30c3\u30a6\u309b\u30a9":"vvo", - u"\u30c3\u30ab":"kka", u"\u30c3\u30ac":"gga", - u"\u30c3\u30ad":"kki", u"\u30c3\u30ad\u30e3":"kkya", - u"\u30c3\u30ad\u30e5":"kkyu", u"\u30c3\u30ad\u30e7":"kkyo", - u"\u30c3\u30ae":"ggi", u"\u30c3\u30ae\u30e3":"ggya", - u"\u30c3\u30ae\u30e5":"ggyu", u"\u30c3\u30ae\u30e7":"ggyo", - u"\u30c3\u30af":"kku", u"\u30c3\u30b0":"ggu", - u"\u30c3\u30b1":"kke", u"\u30c3\u30b2":"gge", - u"\u30c3\u30b3":"kko", u"\u30c3\u30b4":"ggo", - u"\u30c3\u30b5":"ssa", u"\u30c3\u30b6":"zza", - u"\u30c3\u30b7":"sshi", u"\u30c3\u30b7\u30e3":"ssha", - u"\u30c3\u30b7\u30e5":"sshu", u"\u30c3\u30b7\u30e7":"ssho", - u"\u30c3\u30b8":"jji", u"\u30c3\u30b8\u30e3":"jja", - u"\u30c3\u30b8\u30e5":"jju", u"\u30c3\u30b8\u30e7":"jjo", - u"\u30c3\u30b9":"ssu", u"\u30c3\u30ba":"zzu", - u"\u30c3\u30bb":"sse", u"\u30c3\u30be":"zze", - u"\u30c3\u30bd":"sso", u"\u30c3\u30be":"zzo", - u"\u30c3\u30bf":"tta", u"\u30c3\u30c0":"dda", - u"\u30c3\u30c1":"tchi", u"\u30c3\u30c1\u30e3":"tcha", - u"\u30c3\u30c1\u30e5":"tchu", u"\u30c3\u30c1\u30e7":"tcho", - u"\u30c3\u30c2":"jji", u"\u30c3\u30c2\u30e3":"jjya", - u"\u30c3\u30c2\u30e5":"jjyu", u"\u30c3\u30c2\u30e7":"jjyo", - u"\u30c3\u30c4":"ttsu", u"\u30c3\u30c5":"zzu", - u"\u30c3\u30c6":"tte", u"\u30c3\u30c7":"dde", - u"\u30c3\u30c8":"tto", u"\u30c3\u30c9":"ddo", - u"\u30c3\u30cf":"hha", u"\u30c3\u30d0":"bba", - u"\u30c3\u30d1":"ppa", - u"\u30c3\u30d2":"hhi", u"\u30c3\u30d2\u30e3":"hhya", - u"\u30c3\u30d2\u30e5":"hhyu", u"\u30c3\u30d2\u30e7":"hhyo", - u"\u30c3\u30d3":"bbi", u"\u30c3\u30d3\u30e3":"bbya", - u"\u30c3\u30d3\u30e5":"bbyu", u"\u30c3\u30d3\u30e7":"bbyo", - u"\u30c3\u30d4":"ppi", u"\u30c3\u30d4\u30e3":"ppya", - u"\u30c3\u30d4\u30e5":"ppyu", u"\u30c3\u30d4\u30e7":"ppyo", - u"\u30c3\u30d5":"ffu", u"\u30c3\u30d5\u30a1":"ffa", - u"\u30c3\u30d5\u30a3":"ffi", u"\u30c3\u30d5\u30a7":"ffe", - u"\u30c3\u30d5\u30a9":"ffo", - u"\u30c3\u30d6":"bbu", u"\u30c3\u30d7":"ppu", - u"\u30c3\u30d8":"hhe", u"\u30c3\u30d9":"bbe", - u"\u30c3\u30da":"ppe", - u"\u30c3\u30db":"hho", u"\u30c3\u30dc":"bbo", - u"\u30c3\u30dd":"ppo", - u"\u30c3\u30e4":"yya", u"\u30c3\u30e6":"yyu", - u"\u30c3\u30e8":"yyo", - u"\u30c3\u30e9":"rra", u"\u30c3\u30ea":"rri", - u"\u30c3\u30ea\u30e3":"rrya", u"\u30c3\u30ea\u30e5":"rryu", - u"\u30c3\u30ea\u30e7":"rryo", - u"\u30c3\u30eb":"rru", u"\u30c3\u30ec":"rre", - u"\u30c3\u30ed":"rro", - - u"\u30c4":"tsu", u"\u30c5":"zu", - u"\u30c6":"te", u"\u30c7":"de", u"\u30c7\u30a3":"di", - u"\u30c8":"to", u"\u30c9":"do", - - u"\u30ca":"na", - u"\u30cb":"ni", u"\u30cb\u30e3":"nya", - u"\u30cb\u30e5":"nyu", u"\u30cb\u30e7":"nyo", - u"\u30cc":"nu", u"\u30cd":"ne", u"\u30ce":"no", - - u"\u30cf":"ha", u"\u30d0":"ba", u"\u30d1":"pa", - u"\u30d2":"hi", u"\u30d2\u30e3":"hya", - u"\u30d2\u30e5":"hyu", u"\u30d2\u30e7":"hyo", - u"\u30d3":"bi", u"\u30d3\u30e3":"bya", - u"\u30d3\u30e5":"byu", u"\u30d3\u30e7":"byo", - u"\u30d4":"pi", u"\u30d4\u30e3":"pya", - u"\u30d4\u30e5":"pyu", u"\u30d4\u30e7":"pyo", - u"\u30d5":"fu", u"\u30d5\u30a1":"fa", - u"\u30d5\u30a3":"fi", u"\u30d5\u30a7":"fe", - u"\u30d5\u30a9":"fo", - u"\u30d6":"bu", u"\u30d7":"pu", - u"\u30d8":"he", u"\u30d9":"be", u"\u30da":"pe", - u"\u30db":"ho", u"\u30dc":"bo", u"\u30dd":"po", - - u"\u30de":"ma", - u"\u30df":"mi", u"\u30df\u30e3":"mya", - u"\u30df\u30e5":"myu", u"\u30df\u30e7":"myo", - u"\u30e0":"mu", u"\u30e1":"me", u"\u30e2":"mo", - - u"\u30e3":"ya", u"\u30e4":"ya", - u"\u30e5":"yu", u"\u30e6":"yu", - u"\u30e7":"yo", u"\u30e8":"yo", - - u"\u30e9":"ra", - u"\u30ea":"ri", u"\u30ea\u30e3":"rya", - u"\u30ea\u30e5":"ryu", u"\u30ea\u30e7":"ryo", - u"\u30eb":"ru", u"\u30ec":"re", u"\u30ed":"ro", - - u"\u30ee":"wa", u"\u30ef":"wa", - u"\u30f0":"i", u"\u30f1":"e", - u"\u30f2":"wo", u"\u30f3":"n", - - u"\u30f3\u30a2":"n'a", u"\u30f3\u30a4":"n'i", - u"\u30f3\u30a6":"n'u", u"\u30f3\u30a8":"n'e", - u"\u30f3\u30aa":"n'o", - - u"\u30f4":"vu", u"\u30f5":"ka", - u"\u30f6":"ke", - } + def __init__(self): + self.kanwa = jisyo() def isKatakana(self, char): return ( 0x30a0 < ord(char) and ord(char) < 0x30f7) @@ -172,11 +40,11 @@ class K2a (object): def convert(self, text): Hstr = "" max_len = -1 - r = min(4, len(text)+1) + r = min(10, len(text)+1) for x in xrange(r): - if text[:x] in self.K2a_table: + if text[:x] in self.kanwa.kanadict: if max_len < x: max_len = x - Hstr = self.K2a_table[text[:x]] + Hstr = self.kanwa.kanadict[text[:x]] return (Hstr, max_len)