When converting non-English text to English, use the user's current calibre interface language. This allows Japanese/Korean/Vietnamese characters to be correctly converted. Previously they were assumed to be Chinese. Fixes #7622 (calibre needs to switch logic when converting Unicode filenames into ASCII)

This commit is contained in:
Kovid Goyal 2011-02-14 10:50:28 -07:00
commit c4f06e39af
27 changed files with 146309 additions and 3292 deletions

View File

@ -193,6 +193,33 @@ License: GPL-3
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL-3 on Debian systems.
Files: src/calibre/ebooks/unihandecode/pykakasi/*
Copyright: 2011, Hiroshi Miura <miurahr@linux.com>
Copyright: 1992, Hironobu Takahashi
License: GPL-2+
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL on Debian systems.
Files: resources/kanwadict2.db
Files: resources/itaijidict2.pickle
Copyright: 2011, Hiroshi Miura <miurahr@linux.com>
Copyright: 1992 1993 1994, Hironobu Takahashi (takahasi@tiny.or.jp),
Copyright: 1992 1993 1994, Masahiko Sato (masahiko@sato.riec.tohoku.ac.jp),
Copyright: 1992 1993 1994, Yukiyoshi Kameyama, Miki Inooka, Akihiko Sasaki, Dai Ando, Junichi Okukawa,
Copyright: 1992 1993 1994, Katsushi Sato and Nobuhiro Yamagishi
License: GPL-2+
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL on Debian systems.
Files: src/calibre/ebooks/unihandecode/*
Copyright: 2010-2011, Hiroshi Miura <miurahr@linux.com>
Copyright: 2009, John Schember
Copyright: 2007, Russell Norris
Copyright: 2001, Sean M. Burke
License: GPL-3, Perl
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL-3 on Debian systems.
Files: src/encutils/__init__.py
Copyright: 2005-2008: Christof Hoeke
License: LGPL-3+, CC-BY-3.0

View File

@ -6,9 +6,10 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, cPickle
import os, cPickle, re, anydbm, shutil
from zlib import compress
from setup import Command, basenames
from setup import Command, basenames, __appname__
def get_opts_from_parser(parser):
def do_opt(opt):
@ -26,6 +27,9 @@ class Resources(Command):
description = 'Compile various needed calibre resources'
KAKASI_PATH = os.path.join(Command.SRC, __appname__,
'ebooks', 'unihandecode', 'pykakasi')
def run(self, opts):
scripts = {}
for x in ('console', 'gui'):
@ -101,11 +105,113 @@ class Resources(Command):
import json
json.dump(function_dict, open(dest, 'wb'), indent=4)
self.run_kakasi(opts)
def run_kakasi(self, opts):
    """Build the pykakasi dictionary resources (kanwa dbm database,
    itaiji and kana pickles) from their .utf8 source files, skipping
    any output that is already newer than its source."""
    # Accumulator filled by parsekdict() and flushed by kanwaout().
    self.records = {}
    src = self.j(self.KAKASI_PATH, 'kakasidict.utf8')
    dest = self.j(self.RESOURCES, 'localization',
            'pykakasi','kanwadict2.db')
    base = os.path.dirname(dest)
    if not os.path.exists(base):
        os.makedirs(base)
    if not self.newer(dest, src):
        self.info('\tKanwadict is up to date')
    else:
        self.info('\tGenerating Kanwadict')
        # One dictionary entry per line; parsekdict() splits each into
        # (reading, kanji) and files it under the first codepoint.
        for line in open(src, "r"):
            self.parsekdict(line)
        self.kanwaout(dest)
    src = self.j(self.KAKASI_PATH, 'itaijidict.utf8')
    dest = self.j(self.RESOURCES, 'localization',
            'pykakasi','itaijidict2.pickle')
    if not self.newer(dest, src):
        self.info('\tItaijidict is up to date')
    else:
        self.info('\tGenerating Itaijidict')
        self.mkitaiji(src, dest)
    src = self.j(self.KAKASI_PATH, 'kanadict.utf8')
    dest = self.j(self.RESOURCES, 'localization',
            'pykakasi','kanadict2.pickle')
    if not self.newer(dest, src):
        self.info('\tKanadict is up to date')
    else:
        self.info('\tGenerating kanadict')
        self.mkkanadict(src, dest)
    return
def mkitaiji(self, src, dst):
    """Build the itaiji (variant kanji) pickle from *src*.

    Each non-comment line holds a two character pair: the variant
    character followed by its canonical replacement. Literal \\uXXXX
    escapes in the source are expanded before the pair is split.
    """
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle
    try:
        _unichr = unichr  # Python 2
    except NameError:
        _unichr = chr
    dic = {}
    # Binary read + explicit decode works identically on py2 and py3.
    with open(src, 'rb') as f:
        for raw in f:
            line = raw.decode("utf-8").strip()
            if line.startswith(';;'):  # skip comment
                continue
            if not line:  # skip blank lines
                continue
            # Expand literal \uXXXX escapes into real characters.
            pair = re.sub(r'\\u([0-9a-fA-F]{4})',
                    lambda x: _unichr(int(x.group(1), 16)), line)
            # variant character -> canonical character
            dic[pair[0]] = pair[1]
    # 'wb' is required: protocol -1 selects a binary pickle format,
    # which text mode would corrupt on Windows.
    with open(dst, 'wb') as f:
        pickle.dump(dic, f, protocol=-1)
def mkkanadict(self, src, dst):
    """Build the kana pickle from *src*.

    Each non-comment line is ``romaji katakana``; the resulting dict
    maps the katakana string back to its romaji reading.
    """
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle
    dic = {}
    # Binary read + explicit decode works identically on py2 and py3.
    with open(src, 'rb') as f:
        for raw in f:
            line = raw.decode("utf-8").strip()
            if line.startswith(';;'):  # skip comment
                continue
            if not line:  # skip blank lines
                continue
            (alpha, kana) = line.split(' ')
            dic[kana] = alpha
    # 'wb' is required: protocol -1 selects a binary pickle format,
    # which text mode would corrupt on Windows.
    with open(dst, 'wb') as f:
        pickle.dump(dic, f, protocol=-1)
def parsekdict(self, line):
    """Parse one ``reading kanji`` line of the kanwa dictionary and
    record it via updaterec()."""
    entry = line.decode("utf-8").strip()
    if entry.startswith(';;'):  # comment line in the source dictionary
        return
    (reading, kanji) = entry.split(' ')
    tail = ''
    last = reading[-1:]
    # A trailing ASCII letter is not part of the reading: it encodes
    # the consonant class of the character that may follow (okurigana
    # hint), and is stored separately as the entry's tail.
    if ord(last) <= ord('z'):
        tail = last
        reading = reading[:-1]
    self.updaterec(kanji, reading, tail)
def updaterec(self, kanji, yomi, tail):
    """Append a (yomi, tail) reading for *kanji* to self.records,
    grouped under the four digit hex codepoint of its first
    character (the key later used for the dbm lookup)."""
    key = "%04x" % ord(kanji[0])
    group = self.records.setdefault(key, {})
    group.setdefault(kanji, []).append((yomi, tail))
def kanwaout(self, out):
    """Write the accumulated reading records to *out* as a dbm
    database; each value is a zlib-compressed pickle of the per-key
    dict of readings."""
    db = anydbm.open(out, 'c')
    for key in self.records:
        db[key] = compress(cPickle.dumps(self.records[key], -1))
    db.close()
def clean(self):
    """Remove all generated resource files, including the pykakasi
    dictionary directory."""
    for name in ('scripts', 'recipes', 'ebook-convert-complete'):
        path = self.j(self.RESOURCES, name + '.pickle')
        if os.path.exists(path):
            os.remove(path)
    kakasi_dir = self.j(self.RESOURCES, 'localization', 'pykakasi')
    if os.path.exists(kakasi_dir):
        shutil.rmtree(kakasi_dir)

View File

@ -402,8 +402,8 @@ OptionRecommendation(name='asciiize',
'with "Mikhail Gorbachiov". Also, note that in '
'cases where there are multiple representations of a character '
'(characters shared by Chinese and Japanese for instance) the '
'representation used by the largest number of people will be '
'used (Chinese in the previous example).')%\
'representation based on the current calibre interface language will be '
'used.')%\
u'\u041c\u0438\u0445\u0430\u0438\u043b '
u'\u0413\u043e\u0440\u0431\u0430\u0447\u0451\u0432'
)

View File

@ -543,9 +543,9 @@ class HTMLPreProcessor(object):
html = XMLDECL_RE.sub('', html)
if getattr(self.extra_opts, 'asciiize', False):
from calibre.ebooks.unidecode.unidecoder import Unidecoder
unidecoder = Unidecoder()
html = unidecoder.decode(html)
from calibre.utils.localization import get_udc
unihandecoder = get_udc()
html = unihandecoder.decode(html)
if getattr(self.extra_opts, 'enable_heuristics', False):
from calibre.ebooks.conversion.utils import HeuristicProcessor
@ -557,10 +557,10 @@ class HTMLPreProcessor(object):
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
if unsupported_unicode_chars:
from calibre.ebooks.unidecode.unidecoder import Unidecoder
unidecoder = Unidecoder()
from calibre.utils.localization import get_udc
unihandecoder = get_udc()
for char in unsupported_unicode_chars:
asciichar = unidecoder.decode(char)
asciichar = unihandecoder.decode(char)
html = html.replace(char, asciichar)
return html

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
__all__ = ["Unihandecoder"]
'''
Decode unicode text to an ASCII representation of the text.
Translate unicode characters to ASCII.
inspired from John's unidecode library.
Copyright(c) 2009, John Schember
Transliterate the string from unicode characters to ASCII in Chinese and other languages.
'''
import unicodedata
class Unihandecoder(object):
    '''Dispatch to a language specific transliterator based on the
    user's interface language, so that han characters shared between
    Chinese/Japanese/Korean/Vietnamese are romanized correctly.'''
    preferred_encoding = None
    decoder = None

    def __init__(self, lang="zh", encoding='utf-8'):
        self.preferred_encoding = encoding
        lang = lang.lower()
        # calibre's get_lang() returns ISO 639 codes ('ja', 'ko', 'vi');
        # the original country-code spellings ('kr', 'vn', 'vietnum')
        # are still accepted for backwards compatibility — without the
        # ISO codes Korean/Vietnamese fell through to the Chinese
        # decoder.
        if lang[:2] == u'ja':
            from calibre.ebooks.unihandecode.jadecoder import Jadecoder
            self.decoder = Jadecoder()
        elif lang[:2] in (u'ko', u'kr') or lang == u'korean':
            from calibre.ebooks.unihandecode.krdecoder import Krdecoder
            self.decoder = Krdecoder()
        elif lang[:2] in (u'vi', u'vn') or lang == u'vietnum':
            from calibre.ebooks.unihandecode.vndecoder import Vndecoder
            self.decoder = Vndecoder()
        else: # zh and everything else
            from calibre.ebooks.unihandecode.unidecoder import Unidecoder
            self.decoder = Unidecoder()

    def decode(self, text):
        '''Return an ASCII approximation of *text*.

        The input is coerced to unicode (preferred encoding first,
        then UTF-8 with replacement), NFKC normalized and handed to
        the language specific decoder.
        '''
        try:
            unicode  # only defined on Python 2
            if not isinstance(text, unicode):
                try:
                    text = unicode(text)
                except:
                    try:
                        text = text.decode(self.preferred_encoding)
                    except:
                        text = text.decode('utf-8', 'replace')
        except NameError:  # Python 3: str is already unicode
            pass
        # Normalize first so compatibility forms (full-width latin,
        # combining marks) map onto characters the decoder tables know.
        ntext = unicodedata.normalize('NFKC', text)
        return self.decoder.decode(ntext)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,41 @@
# coding:utf8
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
'''
Decode unicode text to an ASCII representation of the text for Japanese.
Translate unicode string to ASCII roman string.
API is based on the python unidecode,
which is based on Ruby gem (http://rubyforge.org/projects/unidecode/)
and perl module Text::Unidecode
(http://search.cpan.org/~sburke/Text-Unidecode-0.04/).
This functionality is owned by Kakasi Japanese processing engine.
Copyright (c) 2010 Hiroshi Miura
'''
import re
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
from calibre.ebooks.unihandecode.jacodepoints import CODEPOINTS as JACODES
from calibre.ebooks.unihandecode.pykakasi.kakasi import kakasi
class Jadecoder(Unidecoder):
    '''Japanese transliterator: romanizes kanji/kana with kakasi and
    falls back to the codepoint tables for any remaining non-ASCII
    characters.'''
    kakasi = None
    codepoints = {}

    def __init__(self):
        # Copy before updating: the original updated the imported
        # CODEPOINTS dict in place, leaking the Japanese overrides into
        # every other decoder that shares the module-level table.
        self.codepoints = dict(CODEPOINTS)
        self.codepoints.update(JACODES)
        self.kakasi = kakasi()

    def decode(self, text):
        '''Return an ASCII representation of *text*.'''
        try:
            result = self.kakasi.do(text)
        except Exception:
            # kakasi can fail (e.g. missing dictionary resources);
            # degrade to plain table transliteration of the input.
            result = text
        return re.sub(r'[^\x00-\x7f]',
                lambda x: self.replace_point(x.group()), result)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
'''
Decode unicode text to an ASCII representation of the text in Korean.
Based on unidecoder.
'''
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
from calibre.ebooks.unihandecode.krcodepoints import CODEPOINTS as HANCODES
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
class Krdecoder(Unidecoder):
    '''Korean transliterator: Unidecoder with the Hangul codepoint
    tables layered on top.'''
    codepoints = {}

    def __init__(self):
        # Copy before updating: the original updated the imported
        # CODEPOINTS dict in place, leaking the Hangul overrides into
        # every other decoder that shares the module-level table.
        self.codepoints = dict(CODEPOINTS)
        self.codepoints.update(HANCODES)

View File

@ -0,0 +1,5 @@
from calibre.ebooks.unihandecode.pykakasi.kakasi import kakasi
kakasi
__all__ = ["pykakasi"]

View File

@ -0,0 +1,185 @@
# -*- coding: utf-8 -*-
# h2a.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
class H2a (object):
    '''Longest-prefix conversion of hiragana (plus a few multi-char
    digraphs) to romaji. The table keys are 1-4 character hiragana
    sequences; values are their Hepburn-style romanizations.'''
    H2a_table = {
        u"\u3041":"a", u"\u3042":"a",
        u"\u3043":"i", u"\u3044":"i",
        u"\u3045":"u", u"\u3046":"u",
        u"\u3046\u309b":"vu", u"\u3046\u309b\u3041":"va",
        u"\u3046\u309b\u3043":"vi", u"\u3046\u309b\u3047":"ve",
        u"\u3046\u309b\u3049":"vo",
        u"\u3047":"e", u"\u3048":"e",
        u"\u3049":"o", u"\u304a":"o",
        u"\u304b":"ka", u"\u304c":"ga",
        # NOTE(review): the kya/kyu keys below use small a (\u3041) and
        # small u (\u3045) instead of small ya (\u3083) / yu (\u3085) —
        # confirm against upstream kakasi.
        u"\u304d":"ki", u"\u304d\u3041":"kya",
        u"\u304d\u3045":"kyu", u"\u304d\u3049":"kyo",
        # NOTE(review): "gya" is keyed on \u3050\u3083 (gu + small ya);
        # the expected key is \u304e\u3083 (gi + small ya). Likewise
        # "gyu" uses \u3045 (small u) instead of \u3085 (small yu).
        u"\u304e":"gi", u"\u3050\u3083":"gya",
        u"\u304e\u3045":"gyu", u"\u304e\u3087":"gyo",
        u"\u304f":"ku", u"\u3050":"gu",
        u"\u3051":"ke", u"\u3052":"ge",
        u"\u3053":"ko", u"\u3054":"go",
        u"\u3055":"sa", u"\u3056":"za",
        u"\u3057":"shi", u"\u3057\u3083":"sha",
        u"\u3057\u3085":"shu", u"\u3057\u3087":"sho",
        u"\u3058":"ji", u"\u3058\u3083":"ja",
        u"\u3058\u3085":"ju", u"\u3058\u3087":"jo",
        u"\u3059":"su", u"\u305a":"zu",
        u"\u305b":"se", u"\u305c":"ze",
        u"\u305d":"so", u"\u305e":"zo",
        u"\u305f":"ta", u"\u3060":"da",
        u"\u3061":"chi", u"\u3061\u3047":"che", u"\u3061\u3083":"cha",
        u"\u3061\u3085":"chu", u"\u3061\u3087":"cho",
        u"\u3062":"ji", u"\u3062\u3083":"ja",
        u"\u3062\u3085":"ju", u"\u3062\u3087":"jo",
        u"\u3063":"tsu",
        u"\u3063\u3046\u309b":"vvu",
        u"\u3063\u3046\u309b\u3041":"vva",
        u"\u3063\u3046\u309b\u3043":"vvi",
        u"\u3063\u3046\u309b\u3047":"vve",
        u"\u3063\u3046\u309b\u3049":"vvo",
        u"\u3063\u304b":"kka", u"\u3063\u304c":"gga",
        u"\u3063\u304d":"kki", u"\u3063\u304d\u3083":"kkya",
        u"\u3063\u304d\u3085":"kkyu", u"\u3063\u304d\u3087":"kkyo",
        u"\u3063\u304e":"ggi", u"\u3063\u304e\u3083":"ggya",
        u"\u3063\u304e\u3085":"ggyu", u"\u3063\u304e\u3087":"ggyo",
        u"\u3063\u304f":"kku", u"\u3063\u3050":"ggu",
        u"\u3063\u3051":"kke", u"\u3063\u3052":"gge",
        u"\u3063\u3053":"kko", u"\u3063\u3054":"ggo",
        u"\u3063\u3055":"ssa", u"\u3063\u3056":"zza",
        u"\u3063\u3057":"sshi", u"\u3063\u3057\u3083":"ssha",
        u"\u3063\u3057\u3085":"sshu", u"\u3063\u3057\u3087":"ssho",
        u"\u3063\u3058":"jji", u"\u3063\u3058\u3083":"jja",
        u"\u3063\u3058\u3085":"jju", u"\u3063\u3058\u3087":"jjo",
        u"\u3063\u3059":"ssu", u"\u3063\u305a":"zzu",
        # NOTE(review): \u3063\u305e is used twice (for "zze" here and
        # "zzo" on the next line); the dict keeps only "zzo". "zze" was
        # presumably meant to be keyed on \u3063\u305c — confirm.
        u"\u3063\u305b":"sse", u"\u3063\u305e":"zze",
        u"\u3063\u305d":"sso", u"\u3063\u305e":"zzo",
        u"\u3063\u305f":"tta", u"\u3063\u3060":"dda",
        u"\u3063\u3061":"tchi", u"\u3063\u3061\u3083":"tcha",
        u"\u3063\u3061\u3085":"tchu", u"\u3063\u3061\u3087":"tcho",
        u"\u3063\u3062":"jji", u"\u3063\u3062\u3083":"jjya",
        u"\u3063\u3062\u3085":"jjyu", u"\u3063\u3062\u3087":"jjyo",
        u"\u3063\u3064":"ttsu", u"\u3063\u3065":"zzu",
        u"\u3063\u3066":"tte", u"\u3063\u3067":"dde",
        u"\u3063\u3068":"tto", u"\u3063\u3069":"ddo",
        u"\u3063\u306f":"hha", u"\u3063\u3070":"bba",
        u"\u3063\u3071":"ppa",
        u"\u3063\u3072":"hhi", u"\u3063\u3072\u3083":"hhya",
        u"\u3063\u3072\u3085":"hhyu", u"\u3063\u3072\u3087":"hhyo",
        u"\u3063\u3073":"bbi", u"\u3063\u3073\u3083":"bbya",
        u"\u3063\u3073\u3085":"bbyu", u"\u3063\u3073\u3087":"bbyo",
        u"\u3063\u3074":"ppi", u"\u3063\u3074\u3083":"ppya",
        u"\u3063\u3074\u3085":"ppyu", u"\u3063\u3074\u3087":"ppyo",
        u"\u3063\u3075":"ffu", u"\u3063\u3075\u3041":"ffa",
        u"\u3063\u3075\u3043":"ffi", u"\u3063\u3075\u3047":"ffe",
        u"\u3063\u3075\u3049":"ffo",
        u"\u3063\u3076":"bbu", u"\u3063\u3077":"ppu",
        u"\u3063\u3078":"hhe", u"\u3063\u3079":"bbe",
        u"\u3063\u307a":"ppe",
        u"\u3063\u307b":"hho", u"\u3063\u307c":"bbo",
        u"\u3063\u307d":"ppo",
        u"\u3063\u3084":"yya", u"\u3063\u3086":"yyu",
        u"\u3063\u3088":"yyo",
        u"\u3063\u3089":"rra", u"\u3063\u308a":"rri",
        u"\u3063\u308a\u3083":"rrya", u"\u3063\u308a\u3085":"rryu",
        u"\u3063\u308a\u3087":"rryo",
        u"\u3063\u308b":"rru", u"\u3063\u308c":"rre",
        u"\u3063\u308d":"rro",
        u"\u3064":"tsu", u"\u3065":"zu",
        u"\u3066":"te", u"\u3067":"de", u"\u3067\u3043":"di",
        u"\u3068":"to", u"\u3069":"do",
        u"\u306a":"na",
        u"\u306b":"ni", u"\u306b\u3083":"nya",
        u"\u306b\u3085":"nyu", u"\u306b\u3087":"nyo",
        u"\u306c":"nu", u"\u306d":"ne", u"\u306e":"no",
        u"\u306f":"ha", u"\u3070":"ba", u"\u3071":"pa",
        u"\u3072":"hi", u"\u3072\u3083":"hya",
        u"\u3072\u3085":"hyu", u"\u3072\u3087":"hyo",
        u"\u3073":"bi", u"\u3073\u3083":"bya",
        u"\u3073\u3085":"byu", u"\u3073\u3087":"byo",
        u"\u3074":"pi", u"\u3074\u3083":"pya",
        u"\u3074\u3085":"pyu", u"\u3074\u3087":"pyo",
        u"\u3075":"fu", u"\u3075\u3041":"fa",
        u"\u3075\u3043":"fi", u"\u3075\u3047":"fe",
        u"\u3075\u3049":"fo",
        u"\u3076":"bu", u"\u3077":"pu",
        u"\u3078":"he", u"\u3079":"be", u"\u307a":"pe",
        u"\u307b":"ho", u"\u307c":"bo", u"\u307d":"po",
        u"\u307e":"ma",
        u"\u307f":"mi", u"\u307f\u3083":"mya",
        u"\u307f\u3085":"myu", u"\u307f\u3087":"myo",
        u"\u3080":"mu", u"\u3081":"me", u"\u3082":"mo",
        u"\u3083":"ya", u"\u3084":"ya",
        u"\u3085":"yu", u"\u3086":"yu",
        u"\u3087":"yo", u"\u3088":"yo",
        u"\u3089":"ra",
        u"\u308a":"ri", u"\u308a\u3083":"rya",
        u"\u308a\u3085":"ryu", u"\u308a\u3087":"ryo",
        u"\u308b":"ru", u"\u308c":"re", u"\u308d":"ro",
        u"\u308e":"wa", u"\u308f":"wa",
        u"\u3090":"i", u"\u3091":"e",
        u"\u3092":"wo", u"\u3093":"n",
        u"\u3093\u3042":"n'a", u"\u3093\u3044":"n'i",
        u"\u3093\u3046":"n'u", u"\u3093\u3048":"n'e",
        u"\u3093\u304a":"n'o",
    }

    # this class is Borg: all instances share one __dict__, so state
    # (there is none beyond the class table) is process-wide
    _shared_state = {}

    def __new__(cls, *p, **k):
        self = object.__new__(cls, *p, **k)
        self.__dict__ = cls._shared_state
        return self

    def isHiragana(self, char):
        # Hiragana range U+3041..U+3093 (excludes U+3094 and the marks
        # above it).
        return ( 0x3040 < ord(char) and ord(char) < 0x3094)

    def convert(self, text):
        '''Return (romaji, matched_length) for the longest prefix of
        *text* found in H2a_table; ("", -1) when nothing matches.'''
        Hstr = ""
        max_len = -1
        # NOTE(review): xrange(r) stops at r-1, so with min(4, ...) only
        # prefixes of up to 3 characters are tried — the 4-character
        # keys above (e.g. u"\u3063\u3046\u309b\u3041") can never
        # match. Confirm whether this should be min(5, len(text)+1).
        r = min(4, len(text)+1)
        for x in xrange(r):
            if text[:x] in self.H2a_table:
                if max_len < x:
                    max_len = x
                    Hstr = self.H2a_table[text[:x]]
        return (Hstr, max_len)

View File

@ -0,0 +1,564 @@
芦蘆
壱一
苅刈
舘館
曽曾
菟兎
島嶋
盃杯
冨富
峯峰
亘亙
弌一
乘乗
亂乱
豫予
亊事
弍二
亞亜
亰京
从従
仭仞
佛仏
來来
儘侭
伜倅
假仮
會会
做作
傳伝
僞偽
價価
儉倹
兒児
兔兎
竸競
兩両
囘回
册冊
冢塚
冩写
决決
冱冴
冰氷
况況
凉涼
處処
凾函
刄刃
刔抉
刧劫
剩剰
劍剣
劔剣
劒剣
剱剣
劑剤
辨弁
勞労
勳勲
勵励
勸勧
區区
卆卒
丗世
凖準
夘卯
卻却
卷巻
厠廁
厦廈
厮廝
厰廠
參参
雙双
咒呪
單単
噐器
營営
嚏嚔
嚴厳
囑嘱
囓齧
圀国
圈圏
國国
圍囲
圓円
團団
圖図
埀垂
埓埒
塲場
壞壊
墮堕
壓圧
壘塁
壥廛
壤壌
壯壮
壺壷
壹一
壻婿
壽寿
夂夊
夛多
梦夢
竒奇
奧奥
奬奨
侫佞
姙妊
嫻嫺
孃嬢
學学
斈学
寃冤
寇冦
寢寝
寫写
寶宝
寳宝
尅剋
將将
專専
對対
尓爾
尢尤
屆届
屬属
峽峡
嶌嶋
嵜崎
崙崘
嵳嵯
嶽岳
巛川
巵卮
帋紙
帶帯
幤幣
廐厩
廏厩
廣広
廚厨
廢廃
廳庁
廰庁
廸迪
弃棄
弉奘
彜彝
彈弾
彌弥
弯彎
徃往
徑径
從従
徠来
悳徳
恠怪
恆恒
悧俐
惡悪
惠恵
忰悴
惱悩
愼慎
愽博
慘惨
慚慙
憇憩
應応
懷懐
懴懺
戀恋
戞戛
戰戦
戲戯
拔抜
拏拿
擔担
拜拝
拂払
挾挟
搜捜
插挿
搖揺
攝摂
攪撹
據拠
擇択
擧拳
舉拳
抬擡
擴拡
攜携
攵攴
攷考
收収
效効
敕勅
敍叙
敘叙
數数
變変
斷断
旙旛
昜陽
晄晃
晉晋
晝昼
晰晢
暎映
曉暁
暸瞭
昿曠
曵曳
朖朗
朞期
霸覇
杤栃
杰傑
枩松
檜桧
條条
檮梼
梹檳
棊棋
棧桟
棕椶
楙茂
榮栄
槨椁
樂楽
權権
樞枢
樣様
樓楼
橢楕
檢検
櫻桜
鬱欝
盜盗
飮飲
歐嘔
歡歓
歸帰
殘残
殱殲
殼殻
毆殴
毓育
氣気
沒没
泪涙
濤涛
渕淵
渊淵
淨浄
淺浅
滿満
溂剌
溪渓
灌潅
滯滞
澁渋
澀渋
潛潜
濳潜
澂澄
澑溜
澤沢
濟済
濕湿
濱浜
濾滬
灣湾
烱炯
烟煙
熈煕
熏燻
燒焼
爐炉
爭争
爲為
爼俎
犁犂
犹猶
犲豺
狹狭
獎奨
默黙
獨独
獸獣
獵猟
獻献
珎珍
璢瑠
瑯琅
珱瓔
瓣弁
甞嘗
甼町
畄留
畍界
畊耕
畆畝
畧略
畫画
當当
畴疇
疊畳
疉畳
疂畳
癡痴
發発
皃猊
皈帰
皹皸
盖蓋
盡尽
蘯盪
眞真
眦眥
礦鉱
礪砺
碎砕
碯瑙
祕秘
祿禄
齋斎
禪禅
禮礼
禀稟
稱称
稻稲
稾稿
穗穂
穩穏
龝穐
穰穣
窗窓
竈竃
窰窯
竊窃
竝並
筺筐
笋筍
箟箘
筝箏
簔蓑
籠篭
籘籐
籖籤
粹粋
糺糾
絲糸
經経
總総
緜綿
縣県
縱縦
繪絵
繩縄
繼継
緕纃
續続
纖繊
纎繊
纜繿
缺欠
罐缶
罸罰
羃冪
羣群
羮羹
譱善
翆翠
翦剪
耻恥
聟婿
聨聯
聲声
聰聡
聽聴
肅粛
冐冒
脉脈
腦脳
腟膣
膓腸
膸髄
膽胆
臈臘
臟臓
臺台
與与
舊旧
舍舎
舖舗
舩船
艢檣
舮艫
艷艶
莖茎
莊荘
莵兎
菷帚
萠萌
蕚萼
蒂蔕
萬万
葢蓋
蘂蕊
蕋蕊
藪薮
藏蔵
藝芸
藥薬
蘓蘇
乕虎
號号
蠣蛎
蝨虱
蠅蝿
螢蛍
蟆蟇
蟲虫
蠏蟹
蟷螳
蟒蠎
蠶蚕
蠧蠹
蠻蛮
衂衄
衞衛
袵衽
裝装
襃褒
褝襌
覩睹
覺覚
覽覧
觀観
觧解
觸触
誡戒
謌歌
諡謚
謠謡
證証
譛譖
譯訳
譽誉
讀読
讓譲
讚賛
豐豊
貉狢
貍狸
貎猊
豼貔
貘獏
戝財
貭質
貳弐
貮弐
賤賎
賣売
贊賛
賍贓
赱走
踈疎
踴踊
躰体
軆体
軈軅
軣轟
輕軽
輙輒
輌輛
轉転
辭辞
辯弁
迯逃
逹達
逎遒
遞逓
遲遅
邊辺
邉辺
邨村
鄰隣
醉酔
醫医
釀醸
釋釈
釡釜
釼剣
銕鉄
錢銭
鎭鎮
鐵鉄
鐡鉄
鑒鑑
鑄鋳
鑛鉱
鈩鑪
鑚鑽
閇閉
濶闊
關関
阯址
陷陥
險険
隱隠
隸隷
襍雑
雜雑
靈霊
靜静
靱靭
韭韮
韲齏
韵韻
顏顔
顯顕
飃飄
餘余
餝飾
餠餅
騷騒
驅駆
驛駅
驗験
髓髄
體体
髮髪
鬪闘
鰺鯵
鰛鰮
鳬鳧
鳫鴈
鵄鴟
鵞鵝
鷄鶏
鷏鷆
鹽塩
麥麦
麸麩
麪麺
點点
黨党
皷鼓
鼡鼠
齊斉
齒歯
齡齢
龜亀
槇槙
遙遥
瑤瑶
凜凛
熙煕

View File

@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
# j2h.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original Copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
from calibre.ebooks.unihandecode.pykakasi.jisyo import jisyo
import re
class J2H (object):
    '''Convert a leading kanji sequence of a string to its hiragana
    reading, using the kanwa dictionary loaded by jisyo.'''
    kanwa = None

    # Consonant-class table for okurigana matching, indexed by
    # ord(char) - 0x3040 for hiragana U+3041..U+309F. Entry 35
    # (sokuon, U+3063) lists every consonant it can precede; entry 11
    # (U+304B, ka) is "k"; and so on.
    cl_table = [
        "","aiueow", "aiueow", "aiueow", "aiueow", "aiueow", "aiueow", "aiueow",
        "aiueow", "aiueow", "aiueow", "k", "g", "k", "g", "k", "g", "k", "g", "k",
        "g", "s", "zj", "s", "zj", "s", "zj", "s", "zj", "s", "zj", "t", "d", "tc",
        "d", "aiueokstchgzjfdbpw", "t", "d", "t", "d", "t", "d", "n", "n", "n", "n",
        "n", "h", "b", "p", "h", "b", "p", "hf", "b", "p", "h", "b", "p", "h", "b",
        "p", "m", "m", "m", "m", "m", "y", "y", "y", "y", "y", "y", "rl", "rl",
        "rl", "rl", "rl", "wiueo", "wiueo", "wiueo", "wiueo", "w", "n", "v", "k",
        "k", "", "", "", "", "", "", "", "", ""]

    def __init__(self):
        self.kanwa = jisyo()

    def isKanji(self, c):
        # CJK ideograph range used by the kanwa dictionary keys.
        return 0x3400 <= ord(c) < 0xfa2e

    def isCletter(self, l, c):
        '''True if hiragana character *c* can follow a dictionary entry
        whose tail marker is the consonant letter *l*.'''
        # The original read ``ord(u"")`` (an empty literal, most likely
        # a mis-encoded hiragana character), which raised TypeError on
        # every call; explicit codepoints avoid any source-encoding
        # dependence. The table is indexed so that U+3063 maps to the
        # sokuon entry (index 35).
        o = ord(c)
        if 0x3041 <= o <= 0x309f and l in self.cl_table[o - 0x3040]:
            return True
        return False

    def itaiji_conv(self, text):
        '''Replace variant kanji in *text* with their canonical forms.'''
        for c in text:
            if c in self.kanwa.itaijidict:
                # Plain replace: re.sub on a raw character would misfire
                # if the character were a regex metacharacter.
                text = text.replace(c, self.kanwa.itaijidict[c])
        return text

    def convert(self, text):
        '''Return (hiragana_reading, matched_length) for the longest
        dictionary entry matching a prefix of *text*; ("", 0) when the
        first character has no dictionary page.'''
        max_len = 0
        Hstr = ""
        table = self.kanwa.load_jisyo(text[0])
        if table is None:
            return ("", 0)
        for (k, v) in table.iteritems():
            length = len(k)
            if len(text) >= length and text.startswith(k):
                for (yomi, tail) in v:
                    if tail == '':  # was ``is ''`` — fragile identity test
                        if max_len < length:
                            Hstr = yomi
                            max_len = length
                    elif max_len < length+1 and len(text) > length and self.isCletter(tail, text[length]):
                        # Entry with an okurigana hint: consume the
                        # following hiragana as well.
                        Hstr = ''.join([yomi, text[length]])
                        max_len = length + 1
        return (Hstr, max_len)

View File

@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
# jisyo.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
from cPickle import load
import anydbm,marshal
from zlib import decompress
import os
import calibre.utils.resources as resources
class jisyo (object):
    '''Loads and caches the pykakasi dictionary resources: the kanwa
    dbm database, and the itaiji/kana pickles, from calibre's resource
    directory.'''
    kanwadict = None
    itaijidict = None
    kanadict = None
    jisyo_table = {}

    # this class is Borg: every instance shares one __dict__, so the
    # (large) dictionaries are only loaded once per process
    _shared_state = {}

    def __new__(cls, *p, **k):
        self = object.__new__(cls, *p, **k)
        self.__dict__ = cls._shared_state
        return self

    def __init__(self):
        # Each resource is loaded lazily the first time any instance is
        # constructed; later constructions are no-ops thanks to the
        # shared state.
        if self.kanwadict is None:
            dictpath = resources.get_path(os.path.join('localization','pykakasi','kanwadict2.db'))
            self.kanwadict = anydbm.open(dictpath,'r')
        if self.itaijidict is None:
            itaijipath = resources.get_path(os.path.join('localization','pykakasi','itaijidict2.pickle'))
            itaiji_pkl = open(itaijipath, 'rb')
            self.itaijidict = load(itaiji_pkl)
        if self.kanadict is None:
            kanadictpath = resources.get_path(os.path.join('localization','pykakasi','kanadict2.pickle'))
            kanadict_pkl = open(kanadictpath, 'rb')
            self.kanadict = load(kanadict_pkl)

    def load_jisyo(self, char):
        '''Return the dict of kanwa entries for *char*'s codepoint page,
        or None when the database has no page for it. Pages are
        decompressed/unmarshalled on first access and memoized in
        jisyo_table.'''
        try:#python2
            key = "%04x"%ord(unicode(char))
        except:#python3
            key = "%04x"%ord(char)
        try: #already exist?
            table = self.jisyo_table[key]
        except:
            try:
                # Values are zlib-compressed marshal dumps (see the
                # resource builder's kanwaout).
                table = self.jisyo_table[key] = marshal.loads(decompress(self.kanwadict[key]))
            except:
                return None
        return table

View File

@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
# k2a.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
from calibre.ebooks.unihandecode.pykakasi.jisyo import jisyo
class K2a (object):
    '''Longest-prefix conversion of katakana (and gairai words spelled
    in katakana) to romaji, using the kana dictionary loaded by
    jisyo.'''
    kanwa = None

    def __init__(self):
        self.kanwa = jisyo()

    def isKatakana(self, char):
        # Katakana block U+30A1..U+30F6.
        return 0x30a0 < ord(char) < 0x30f7

    def convert(self, text):
        '''Return (romaji, matched_length) for the longest prefix of
        *text* found in the kana dictionary; ("", -1) when nothing
        matches.'''
        best = ""
        best_len = -1
        # range (not py2-only xrange) keeps this module usable on
        # Python 3, matching the py2/py3 shims elsewhere in the package.
        # Prefixes longer than 9 characters are never dictionary keys.
        for size in range(min(10, len(text) + 1)):
            prefix = text[:size]
            if prefix in self.kanwa.kanadict and best_len < size:
                best_len = size
                best = self.kanwa.kanadict[prefix]
        return (best, best_len)

View File

@ -0,0 +1,101 @@
# -*- coding: utf-8 -*-
# kakasi.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original Copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
from calibre.ebooks.unihandecode.pykakasi.j2h import J2H
from calibre.ebooks.unihandecode.pykakasi.h2a import H2a
from calibre.ebooks.unihandecode.pykakasi.k2a import K2a
class kakasi(object):
    '''Drives Japanese romanization: walks the input string and
    dispatches each run of characters to the kanji (J2H), hiragana
    (H2a) or katakana (K2a) converter, copying everything else
    through unchanged.'''
    j2h = None
    h2a = None
    k2a = None

    def __init__(self):
        self.j2h = J2H()
        self.h2a = H2a()
        self.k2a = K2a()

    def do(self, text):
        '''Return *text* with Japanese character runs replaced by
        romaji words (capitalized for kanji-derived words, space
        separated).'''
        otext = ''
        i = 0
        while True:
            if i >= len(text):
                break
            if self.j2h.isKanji(text[i]):
                # Kanji: first get the hiragana reading, then romanize
                # that reading with the hiragana converter.
                (t, l) = self.j2h.convert(text[i:])
                if l <= 0:
                    # No dictionary match: copy the character through.
                    otext = otext + text[i]
                    i = i + 1
                    continue
                i = i + l
                m = 0
                tmptext = ""
                while True:
                    if m >= len(t):
                        break
                    (s, n) = self.h2a.convert(t[m:])
                    if n <= 0:
                        break
                    m = m + n
                    tmptext = tmptext+s
                # Kanji-derived words are capitalized; a space is added
                # unless we are at the end of the input.
                if i >= len(text):
                    otext = otext + tmptext.capitalize()
                else:
                    otext = otext + tmptext.capitalize() +' '
            elif self.h2a.isHiragana(text[i]):
                # NOTE(review): this loop assumes every character in the
                # hiragana range has a table entry; if h2a.convert ever
                # returned l == -1 here, i would step backwards —
                # confirm the table covers U+3041..U+3093 completely.
                tmptext = ''
                while True:
                    (t, l) = self.h2a.convert(text[i:])
                    tmptext = tmptext+t
                    i = i + l
                    if i >= len(text):
                        otext = otext + tmptext
                        break
                    elif not self.h2a.isHiragana(text[i]):
                        otext = otext + tmptext + ' '
                        break
            elif self.k2a.isKatakana(text[i]):
                # Same assumption as the hiragana branch, for the kana
                # dictionary.
                tmptext = ''
                while True:
                    (t, l) = self.k2a.convert(text[i:])
                    tmptext = tmptext+t
                    i = i + l
                    if i >= len(text):
                        otext = otext + tmptext
                        break
                    elif not self.k2a.isKatakana(text[i]):
                        otext = otext + tmptext + ' '
                        break
            else:
                # Not Japanese: pass through unchanged.
                otext = otext + text[i]
                i += 1
        return otext

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,317 @@
;; Kana-Alphabet mapping dictionary
;;
;; To use this mapping table,
;; you should unicode normalize NKFC form.
;;
;; basic mapping
;;
a ァ
a ア
ba バ
bba ッバ
bbe ッベ
bbi ッビ
bbo ッボ
bbu ッブ
bbya ッビャ
bbyo ッビョ
bbyu ッビュ
be ベ
bi ビ
bo ボ
bu ブ
bya ビャ
byo ビョ
byu ビュ
cha チャ
che チェ
chi チ
cho チョ
chu チュ
da ダ
dda ッダ
dde ッデ
ddo ッド
de デ
di ディ
do ド
e ェ
e エ
e ヱ
fa ファ
fe フェ
ffa ッファ
ffe ッフェ
ffi ッフィ
ffo ッフォ
ffu ッフ
fi フィ
fo フォ
fu フ
ga ガ
ge ゲ
gga ッガ
gge ッゲ
ggi ッギ
ggo ッゴ
ggu ッグ
ggya ッギャ
ggyo ッギョ
ggyu ッギュ
gi ギ
go ゴ
gu グ
gya グャ
gyo ギョ
gyu ギゥ
ha ハ
he ヘ
hha ッハ
hhe ッヘ
hhi ッヒ
hho ッホ
hhya ッヒャ
hhyo ッヒョ
hhyu ッヒュ
hi ヒ
ho ホ
hya ヒャ
hyo ヒョ
hyu ヒュ
i ィ
i イ
i ヰ
ja ジャ
ja ヂャ
ji ジ
ji ヂ
jja ッジャ
jji ッジ
jji ッヂ
jjo ッジョ
jju ッジュ
jjya ッヂャ
jjyo ッヂョ
jjyu ッヂュ
jo ジョ
jo ヂョ
ju ジュ
ju ヂュ
ka カ
ka ヵ
ke ケ
ke ヶ
ki キ
kka ッカ
kke ッケ
kki ッキ
kko ッコ
kku ック
kkya ッキャ
kkyo ッキョ
kkyu ッキュ
ko コ
ku ク
kya キァ
kyo キォ
kyu キゥ
ma マ
me メ
mi ミ
mo モ
mu ム
mya ミャ
myo ミョ
myu ミュ
n ン
n'a ンア
n'e ンエ
n'i ンイ
n'o ンオ
n'u ンウ
na ナ
ne ネ
ni ニ
no ノ
nu ヌ
nya ニャ
nyo ニョ
nyu ニュ
o ォ
o オ
pa パ
pe ペ
pi ピ
po ポ
ppa ッパ
ppe ッペ
ppi ッピ
ppo ッポ
ppu ップ
ppya ッピャ
ppyo ッピョ
ppyu ッピュ
pu プ
pya ピャ
pyo ピョ
pyu ピュ
ra ラ
re レ
ri リ
ro ロ
rra ッラ
rre ッレ
rri ッリ
rro ッロ
rru ッル
rrya ッリャ
rryo ッリョ
rryu ッリュ
ru ル
rya リャ
ryo リョ
ryu リュ
sa サ
se セ
sha シャ
shi シ
sho ショ
shu シュ
so ソ
ssa ッサ
sse ッセ
ssha ッシャ
sshi ッシ
ssho ッショ
sshu ッシュ
sso ッソ
ssu ッス
su ス
ta タ
tcha ッチャ
tchi ッチ
tcho ッチョ
tchu ッチュ
te テ
to ト
tsu ッ
tsu ツ
tta ッタ
tte ッテ
tto ット
ttsu ッツ
u ゥ
u ウ
va ヴァ
ve ヴェ
vi ヴィ
vo ヴォ
vu ヴ
vva ッヴァ
vve ッヴェ
vvi ッヴィ
vvo ッヴォ
vvu ッヴ
wa ヮ
wa ワ
wo ヲ
ya ャ
ya ヤ
yo ョ
yo ヨ
yu ュ
yu ユ
yya ッヤ
yyo ッヨ
yyu ッユ
za ザ
ze ゼ
zo ゾ
zu ズ
zu ヅ
zza ッザ
zzo ッゾ
zzu ッズ
zzu ッヅ
;;
;; extended characters
;;
;;
;; gairai terms
;;
all オール
algorithm アルゴリズム
answer アンサー
base ベース
beginner ビギナー
connection コネクション
contents コンテンツ
creator クリエーター
comic コミック
comics コミックス
culture カルチャー
debug デバッグ
debugging デバッギング
design デザイン
digital デジタル
dilemma ジレンマ
directory ディレクトリ
disk ディスク
document ドキュメント
download ダウンロード
electric エレクトリック
facebook フェイスブック
firefox ファイアーフォックス
folder フォルダ
format フォーマット
forum フォーラム
fox フォックス
free フリー
gnome ノーム
gnu グヌー
gozilla ゴジラ
guide ガイド
harvard ハーバード
help ヘルプ
highlight ハイライト
japan ジャパン
journal ジャーナル
library ライブラリ
line ライン
love ラヴ
love ラブ
mail メール
main メイン
mystery ミステリ
mozilla モジラ
network ネットワーク
next ネクスト
new ニュー
news ニュース
native ネイティブ
online オンライン
open オープン
professional プロフェッショナル
profile プロファイル
programmer プログラマ
sample サンプル
series シリーズ
share シェア
social ソーシャル
society ソサエティ
software ソフトウエア
source ソース
street ストリート
system システム
tag タグ
text テキスト
thunderbird サンダーバード
training トレーニング
twitter ツイッター
unicode ユニコード
wall ウオール
wall ウォール
welcome ウェルカム
welcome ウエルカム
wikinomics ウィキノミクス
york ヨーク

File diff suppressed because it is too large Load Diff

View File

@ -1,12 +1,16 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
'''
Decode unicode text to an ASCII representation of the text. Transliterate
unicode characters to ASCII.
Decode unicode text to an ASCII representation of the text in Chinese.
Transliterate unicode characters to ASCII based on chinese pronounce.
derived from John's unidecode library.
Copyright(c) 2009, John Schember
Based on the ruby unidecode gem (http://rubyforge.org/projects/unidecode/) which
is based on the perl module Text::Unidecode
@ -55,29 +59,20 @@ it under the same terms as Perl itself.
'''
import re
from calibre.ebooks.unidecode.unicodepoints import CODEPOINTS
from calibre.constants import preferred_encoding
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
from calibre.ebooks.unihandecode.zhcodepoints import CODEPOINTS as HANCODES
class Unidecoder(object):
codepoints = {}
def __init__(self):
self.codepoints = CODEPOINTS
self.codepoints.update(HANCODES)
def decode(self, text):
'''
Transliterate the string from unicode characters to ASCII.
'''
# The keys for CODEPOINTS is unicode characters, we want to be sure the
# input text is unicode.
if not isinstance(text, unicode):
try:
text = unicode(text)
except:
try:
text = text.decode(preferred_encoding)
except:
text = text.decode('utf-8', 'replace')
# Replace characters larger than 127 with their ASCII equivelent.
return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),
text)
return re.sub('[^\x00-\x7f]',lambda x: self.replace_point(x.group()), text)
def replace_point(self, codepoint):
'''
@ -87,7 +82,7 @@ class Unidecoder(object):
# Split the unicode character xABCD into parts 0xAB and 0xCD.
# 0xAB represents the group within CODEPOINTS to query and 0xCD
# represents the position in the list of characters for the group.
return CODEPOINTS[self.code_group(codepoint)][self.grouped_point(
return self.codepoints[self.code_group(codepoint)][self.grouped_point(
codepoint)]
except:
return '?'
@ -97,12 +92,18 @@ class Unidecoder(object):
Find what group character is a part of.
'''
# Code groups withing CODEPOINTS take the form 'xAB'
return u'x%02x' % (ord(unicode(character)) >> 8)
try:#python2
return 'x%02x' % (ord(unicode(character)) >> 8)
except:
return 'x%02x' % (ord(character) >> 8)
def grouped_point(self, character):
'''
Return the location the replacement character is in the list for a
the group character is a part of.
'''
return ord(unicode(character)) & 255
try:#python2
return ord(unicode(character)) & 255
except:
return ord(character) & 255

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
'''
Decode unicode text to an ASCII representation of the text in Vietnamese.
'''
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
from calibre.ebooks.unihandecode.vncodepoints import CODEPOINTS as HANCODES
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
class Vndecoder(Unidecoder):
    '''Vietnamese transliterator: Unidecoder with the Vietnamese
    codepoint tables layered on top.'''
    codepoints = {}

    def __init__(self):
        # Copy before updating: the original updated the imported
        # CODEPOINTS dict in place, leaking the Vietnamese overrides
        # into every other decoder that shares the module-level table.
        self.codepoints = dict(CODEPOINTS)
        self.codepoints.update(HANCODES)

File diff suppressed because it is too large Load Diff

View File

@ -6,12 +6,12 @@ meaning as possible.
import os
from math import ceil
from calibre.ebooks.unidecode.unidecoder import Unidecoder
from calibre import sanitize_file_name
from calibre.constants import preferred_encoding, iswindows
udc = Unidecoder()
from calibre.utils.localization import get_udc
def ascii_text(orig):
udc = get_udc()
try:
ascii = udc.decode(orig)
except:

View File

@ -169,3 +169,13 @@ def set_qt_translator(translator):
return translator.load(p)
return False
_udc = None

def get_udc():
    '''Return the process-wide Unihandecoder, created lazily for the
    current calibre interface language.'''
    global _udc
    if _udc is not None:
        return _udc
    from calibre.ebooks.unihandecode import Unihandecoder
    _udc = Unihandecoder(lang=get_lang())
    return _udc