mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
When converting non-English text to English, use the user's current calibre interface language. This allows Japanese/Korean/Vietnamese characters to be converted correctly. Previously they were assumed to be Chinese. Fixes #7622 (Calibre need to switch logic when converting Unicode filename into ASCII)
This commit is contained in:
commit
c4f06e39af
27
COPYRIGHT
27
COPYRIGHT
@ -193,6 +193,33 @@ License: GPL-3
|
||||
The full text of the GPL is distributed as in
|
||||
/usr/share/common-licenses/GPL-3 on Debian systems.
|
||||
|
||||
Files: src/calibre/ebooks/unihandecode/pykakasi/*
|
||||
Copyright: 2011, Hiroshi Miura <miurahr@linux.com>
|
||||
Copyright: 1992, Hironobu Takahashi
|
||||
License: GPL-2+
|
||||
The full text of the GPL is distributed as in
|
||||
/usr/share/common-licenses/GPL on Debian systems.
|
||||
|
||||
Files: resources/kanwadict2.db
|
||||
Files: resources/itaijidict2.pickle
|
||||
Copyright: 2011, Hiroshi Miura <miurahr@linux.com>
|
||||
Copyright: 1992 1993 1994, Hironobu Takahashi (takahasi@tiny.or.jp),
|
||||
Copyright: 1992 1993 1994, Masahiko Sato (masahiko@sato.riec.tohoku.ac.jp),
|
||||
Copyright: 1992 1993 1994, Yukiyoshi Kameyama, Miki Inooka, Akihiko Sasaki, Dai Ando, Junichi Okukawa,
|
||||
Copyright: 1992 1993 1994, Katsushi Sato and Nobuhiro Yamagishi
|
||||
License: GPL-2+
|
||||
The full text of the GPL is distributed as in
|
||||
/usr/share/common-licenses/GPL on Debian systems.
|
||||
|
||||
Files: src/calibre/ebooks/unihandecode/*
|
||||
Copyright: 2010-2011, Hiroshi Miura <miurahr@linux.com>
|
||||
Copyright: 2009, John Schember
|
||||
Copyright: 2007, Russell Norris
|
||||
Copyright: 2001, Sean M. Burke
|
||||
License: GPL-3, Perl
|
||||
The full text of the GPL is distributed as in
|
||||
/usr/share/common-licenses/GPL-3 on Debian systems.
|
||||
|
||||
Files: src/encutils/__init__.py
|
||||
Copyright: 2005-2008: Christof Hoeke
|
||||
License: LGPL-3+, CC-BY-3.0
|
||||
|
@ -6,9 +6,10 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, cPickle
|
||||
import os, cPickle, re, anydbm, shutil
|
||||
from zlib import compress
|
||||
|
||||
from setup import Command, basenames
|
||||
from setup import Command, basenames, __appname__
|
||||
|
||||
def get_opts_from_parser(parser):
|
||||
def do_opt(opt):
|
||||
@ -26,6 +27,9 @@ class Resources(Command):
|
||||
|
||||
description = 'Compile various needed calibre resources'
|
||||
|
||||
KAKASI_PATH = os.path.join(Command.SRC, __appname__,
|
||||
'ebooks', 'unihandecode', 'pykakasi')
|
||||
|
||||
def run(self, opts):
|
||||
scripts = {}
|
||||
for x in ('console', 'gui'):
|
||||
@ -101,11 +105,113 @@ class Resources(Command):
|
||||
import json
|
||||
json.dump(function_dict, open(dest, 'wb'), indent=4)
|
||||
|
||||
self.run_kakasi(opts)
|
||||
|
||||
def run_kakasi(self, opts):
    '''
    Generate the three pykakasi data files under
    RESOURCES/localization/pykakasi from the ``*.utf8`` sources in
    KAKASI_PATH:

      * kanwadict2.db       from kakasidict.utf8
      * itaijidict2.pickle  from itaijidict.utf8
      * kanadict2.pickle    from kanadict.utf8

    Each output is regenerated only when ``self.newer(dest, src)`` is
    true; otherwise an "up to date" message is logged.

    :param opts: parsed command options (not used here).
    '''
    # Accumulator filled by parsekdict()/updaterec(), written by kanwaout()
    self.records = {}
    src = self.j(self.KAKASI_PATH, 'kakasidict.utf8')
    dest = self.j(self.RESOURCES, 'localization',
            'pykakasi','kanwadict2.db')
    base = os.path.dirname(dest)
    if not os.path.exists(base):
        os.makedirs(base)

    if not self.newer(dest, src):
        self.info('\tKanwadict is up to date')
    else:
        self.info('\tGenerating Kanwadict')

        # Close the source file deterministically instead of leaking
        # the handle until garbage collection.
        with open(src, "r") as f:
            for line in f:
                self.parsekdict(line)
        self.kanwaout(dest)

    src = self.j(self.KAKASI_PATH, 'itaijidict.utf8')
    dest = self.j(self.RESOURCES, 'localization',
            'pykakasi','itaijidict2.pickle')

    if not self.newer(dest, src):
        self.info('\tItaijidict is up to date')
    else:
        self.info('\tGenerating Itaijidict')
        self.mkitaiji(src, dest)

    src = self.j(self.KAKASI_PATH, 'kanadict.utf8')
    dest = self.j(self.RESOURCES, 'localization',
            'pykakasi','kanadict2.pickle')

    if not self.newer(dest, src):
        self.info('\tKanadict is up to date')
    else:
        self.info('\tGenerating kanadict')
        self.mkkanadict(src, dest)
|
||||
|
||||
|
||||
def mkitaiji(self, src, dst):
    '''
    Compile the itaiji (variant kanji) source file into a pickled dict.

    Every non-blank line of ``src`` that does not start with ``;;`` is
    decoded as UTF-8, has literal ``\\uXXXX`` escapes expanded, and
    contributes ``dic[first char] = second char``.

    :param src: path of the UTF-8 source file.
    :param dst: path of the pickle file to write.
    '''
    dic = {}
    with open(src, "r") as f:
        for line in f:
            line = line.decode("utf-8").strip()
            if not line or line.startswith(';;'): # skip blanks and comments
                continue
            pair = re.sub(r'\\u([0-9a-fA-F]{4})', lambda x:unichr(int(x.group(1),16)), line)
            dic[pair[0]] = pair[1]
    # protocol=-1 selects a binary pickle protocol, so the file must be
    # opened in binary mode; text mode corrupts it on platforms that
    # translate line endings.
    with open(dst, 'wb') as out:
        cPickle.dump(dic, out, protocol=-1) #pickle
|
||||
|
||||
def mkkanadict(self, src, dst):
    '''
    Compile the kana source file into a pickled dict.

    Every non-blank line of ``src`` that does not start with ``;;`` is
    decoded as UTF-8 and split on a single space into ``alpha kana``;
    the result maps ``kana`` to ``alpha``.

    :param src: path of the UTF-8 source file.
    :param dst: path of the pickle file to write.
    '''
    dic = {}
    with open(src, "r") as f:
        for line in f:
            line = line.decode("utf-8").strip()
            if not line or line.startswith(';;'): # skip blanks and comments
                continue
            (alpha, kana) = line.split(' ')
            dic[kana] = alpha
    # protocol=-1 selects a binary pickle protocol, so the file must be
    # opened in binary mode; text mode corrupts it on platforms that
    # translate line endings.
    with open(dst, 'wb') as out:
        cPickle.dump(dic, out, protocol=-1) #pickle
|
||||
|
||||
def parsekdict(self, line):
    '''
    Parse one line of the kakasi dictionary and record it via updaterec().

    A line has the form ``yomi kanji`` separated by a single space. When
    the reading ends in a character whose code point is <= ``'z'``, that
    character is split off as ``tail``; otherwise ``tail`` is empty.

    Blank lines and lines starting with ``;;`` are ignored.

    :param line: one raw (UTF-8 encoded) line of the dictionary file.
    '''
    line = line.decode("utf-8").strip()
    # Skip blanks as well as comments: an empty line would otherwise
    # fail at the tuple-unpack below, unlike in mkitaiji/mkkanadict.
    if not line or line.startswith(';;'):
        return
    (yomi, kanji) = line.split(' ')
    if ord(yomi[-1:]) <= ord('z'):
        tail = yomi[-1:]
        yomi = yomi[:-1]
    else:
        tail = ''
    self.updaterec(kanji, yomi, tail)
|
||||
|
||||
def updaterec(self, kanji, yomi, tail):
    '''
    Add one ``(yomi, tail)`` reading for ``kanji`` to ``self.records``.

    ``self.records`` is a two-level dict: the outer key is the four-digit
    hex code point of the first character of ``kanji``; the inner dict
    maps each full ``kanji`` string to its list of ``(yomi, tail)`` tuples.
    '''
    bucket_id = "%04x"%ord(kanji[0])
    reading = (yomi, tail)
    if bucket_id not in self.records:
        # First entry for this leading character
        self.records[bucket_id] = {kanji: [reading]}
        return
    bucket = self.records[bucket_id]
    if kanji in bucket:
        bucket[kanji].append(reading)
    else:
        bucket[kanji] = [reading]
|
||||
|
||||
def kanwaout(self, out):
    '''
    Write ``self.records`` to the dbm database at ``out``.

    Each value is stored as a zlib-compressed marshal dump. The runtime
    reader in pykakasi's jisyo.load_jisyo() decodes entries with
    ``marshal.loads(decompress(...))``, so the serialisation here has to
    be marshal as well: entries written with cPickle cannot be decoded
    by that reader, which then reports every kanji as having no
    dictionary entry.

    :param out: path of the database file to create or update.
    '''
    # Local import: marshal is not among the names this module imports.
    import marshal
    dic = anydbm.open(out, 'c')
    try:
        for (k, v) in self.records.iteritems():
            dic[k] = compress(marshal.dumps(v))
    finally:
        # Close (and flush) the database even if a write fails
        dic.close()
|
||||
|
||||
|
||||
def clean(self):
    '''
    Delete the generated resources: the scripts, recipes and
    ebook-convert-complete pickles, and the whole pykakasi
    localization directory.
    '''
    for stem in ('scripts', 'recipes', 'ebook-convert-complete'):
        target = self.j(self.RESOURCES, stem+'.pickle')
        if not os.path.exists(target):
            continue
        os.remove(target)
    kakasi_dir = self.j(self.RESOURCES, 'localization', 'pykakasi')
    if os.path.exists(kakasi_dir):
        shutil.rmtree(kakasi_dir)
|
||||
|
||||
|
||||
|
||||
|
@ -402,8 +402,8 @@ OptionRecommendation(name='asciiize',
|
||||
'with "Mikhail Gorbachiov". Also, note that in '
|
||||
'cases where there are multiple representations of a character '
|
||||
'(characters shared by Chinese and Japanese for instance) the '
|
||||
'representation used by the largest number of people will be '
|
||||
'used (Chinese in the previous example).')%\
|
||||
'representation based on the current calibre interface language will be '
|
||||
'used.')%\
|
||||
u'\u041c\u0438\u0445\u0430\u0438\u043b '
|
||||
u'\u0413\u043e\u0440\u0431\u0430\u0447\u0451\u0432'
|
||||
)
|
||||
|
@ -543,9 +543,9 @@ class HTMLPreProcessor(object):
|
||||
html = XMLDECL_RE.sub('', html)
|
||||
|
||||
if getattr(self.extra_opts, 'asciiize', False):
|
||||
from calibre.ebooks.unidecode.unidecoder import Unidecoder
|
||||
unidecoder = Unidecoder()
|
||||
html = unidecoder.decode(html)
|
||||
from calibre.utils.localization import get_udc
|
||||
unihandecoder = get_udc()
|
||||
html = unihandecoder.decode(html)
|
||||
|
||||
if getattr(self.extra_opts, 'enable_heuristics', False):
|
||||
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||
@ -557,10 +557,10 @@ class HTMLPreProcessor(object):
|
||||
|
||||
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
|
||||
if unsupported_unicode_chars:
|
||||
from calibre.ebooks.unidecode.unidecoder import Unidecoder
|
||||
unidecoder = Unidecoder()
|
||||
from calibre.utils.localization import get_udc
|
||||
unihandecoder = get_udc()
|
||||
for char in unsupported_unicode_chars:
|
||||
asciichar = unidecoder.decode(char)
|
||||
asciichar = unihandecoder.decode(char)
|
||||
html = html.replace(char, asciichar)
|
||||
|
||||
return html
|
||||
|
File diff suppressed because it is too large
Load Diff
55
src/calibre/ebooks/unihandecode/__init__.py
Normal file
55
src/calibre/ebooks/unihandecode/__init__.py
Normal file
@ -0,0 +1,55 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
__all__ = ["Unihandecoder"]
|
||||
|
||||
'''
|
||||
Decode unicode text to an ASCII representation of the text.
|
||||
Translate unicode characters to ASCII.
|
||||
|
||||
inspired from John's unidecode library.
|
||||
Copyright(c) 2009, John Schember
|
||||
|
||||
Transliterate the string from unicode characters to ASCII in Chinese and others.
|
||||
|
||||
'''
|
||||
import unicodedata
|
||||
|
||||
class Unihandecoder(object):
    '''
    Front end that picks a language-specific decoder and uses it to turn
    unicode text into an ASCII representation.

    :param lang: language name or code selecting the decoder. Japanese,
                 Korean and Vietnamese have dedicated decoders; anything
                 else uses the generic (Chinese-preferring) Unidecoder.
    :param encoding: encoding tried when ``decode()`` is given a byte
                     string that is not plain ASCII.
    '''
    preferred_encoding = None
    decoder = None

    def __init__(self, lang="zh", encoding='utf-8'):
        self.preferred_encoding = encoding
        family = self._language_family(lang)
        # Decoders are imported lazily so only the selected code point
        # tables are loaded.
        if family == 'ja':
            from calibre.ebooks.unihandecode.jadecoder import Jadecoder
            self.decoder = Jadecoder()
        elif family == 'ko':
            from calibre.ebooks.unihandecode.krdecoder import Krdecoder
            self.decoder = Krdecoder()
        elif family == 'vi':
            from calibre.ebooks.unihandecode.vndecoder import Vndecoder
            self.decoder = Vndecoder()
        else: #zh and others
            from calibre.ebooks.unihandecode.unidecoder import Unidecoder
            self.decoder = Unidecoder()

    @staticmethod
    def _language_family(lang):
        '''
        Map a language name/code to one of 'ja', 'ko', 'vi' or 'zh'.

        The historical spellings (``kr``, ``korean``, ``vn``, ``vietnum``)
        are still accepted. The ISO 639-1 codes ``ko`` and ``vi`` are
        recognised too, with or without a region suffix such as ``ko_KR``;
        previously those fell through to the Chinese decoder.
        '''
        lang = lang.lower()
        # Primary subtag: 'ko_KR' / 'ko-kr' -> 'ko'
        primary = lang.replace('-', '_').split('_')[0]
        if lang[:2] == u'ja':
            return 'ja'
        if lang[:2] == u'kr' or primary in (u'ko', u'korean'):
            return 'ko'
        if lang[:2] == u'vn' or primary in (u'vi', u'vietnum', u'vietnamese'):
            return 'vi'
        return 'zh'

    def decode(self, text):
        '''
        Return the ASCII representation of ``text``.

        Under Python 2 a non-unicode argument is first converted to
        unicode: by the default conversion, then with
        ``preferred_encoding``, and finally as UTF-8 with replacement
        characters. The text is NFKC-normalised before being handed to
        the selected decoder.
        '''
        try:
            text_type = unicode # python2
        except NameError: # python3, str is unicode
            text_type = None
        if text_type is not None and not isinstance(text, text_type):
            try:
                text = text_type(text)
            except Exception:
                try:
                    text = text.decode(self.preferred_encoding)
                except Exception:
                    text = text.decode('utf-8', 'replace')
        #at first unicode normalize it. (see Unicode standards)
        ntext = unicodedata.normalize('NFKC', text)
        return self.decoder.decode(ntext)
|
5251
src/calibre/ebooks/unihandecode/jacodepoints.py
Normal file
5251
src/calibre/ebooks/unihandecode/jacodepoints.py
Normal file
File diff suppressed because it is too large
Load Diff
41
src/calibre/ebooks/unihandecode/jadecoder.py
Normal file
41
src/calibre/ebooks/unihandecode/jadecoder.py
Normal file
@ -0,0 +1,41 @@
|
||||
# coding:utf8
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Decode unicode text to an ASCII representation of the text for Japanese.
|
||||
Translate unicode string to ASCII roman string.
|
||||
|
||||
API is based on the python unidecode,
|
||||
which is based on Ruby gem (http://rubyforge.org/projects/unidecode/)
|
||||
and perl module Text::Unidecode
|
||||
(http://search.cpan.org/~sburke/Text-Unidecode-0.04/).
|
||||
|
||||
This functionality is owned by Kakasi Japanese processing engine.
|
||||
|
||||
Copyright (c) 2010 Hiroshi Miura
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
|
||||
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
|
||||
from calibre.ebooks.unihandecode.jacodepoints import CODEPOINTS as JACODES
|
||||
from calibre.ebooks.unihandecode.pykakasi.kakasi import kakasi
|
||||
|
||||
class Jadecoder(Unidecoder):
    '''
    Decoder for Japanese text: kakasi converts the text first, then any
    remaining non-ASCII character is replaced via ``replace_point()``
    (not defined here; presumably inherited from Unidecoder) using the
    generic code point table overlaid with the Japanese one.
    '''
    kakasi = None
    codepoints = {}

    def __init__(self):
        # Work on a private copy: updating CODEPOINTS in place would
        # leak the Japanese overrides into the shared module-level table
        # and hence into every other decoder created in this process.
        self.codepoints = dict(CODEPOINTS)
        self.codepoints.update(JACODES)
        self.kakasi = kakasi()

    def decode(self, text):
        '''
        Return the ASCII representation of ``text``. If the kakasi-based
        conversion fails for any reason, fall back to plain per-character
        replacement of the original text.
        '''
        try:
            result=self.kakasi.do(text)
            return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),result)
        except Exception:
            # Deliberate best-effort: a kakasi failure must not abort the
            # conversion, so degrade to the generic replacement.
            return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),text)
|
||||
|
5251
src/calibre/ebooks/unihandecode/krcodepoints.py
Normal file
5251
src/calibre/ebooks/unihandecode/krcodepoints.py
Normal file
File diff suppressed because it is too large
Load Diff
24
src/calibre/ebooks/unihandecode/krdecoder.py
Normal file
24
src/calibre/ebooks/unihandecode/krdecoder.py
Normal file
@ -0,0 +1,24 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Decode unicode text to an ASCII representation of the text in Korean.
|
||||
Based on unidecoder.
|
||||
|
||||
'''
|
||||
|
||||
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
|
||||
from calibre.ebooks.unihandecode.krcodepoints import CODEPOINTS as HANCODES
|
||||
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
|
||||
|
||||
class Krdecoder(Unidecoder):
    '''
    Decoder for Korean text: the generic code point table overlaid with
    the Korean readings from krcodepoints.
    '''

    codepoints = {}

    def __init__(self):
        # Work on a private copy: updating CODEPOINTS in place would
        # leak the Korean overrides into the shared module-level table
        # and hence into every other decoder created in this process.
        self.codepoints = dict(CODEPOINTS)
        self.codepoints.update(HANCODES)
|
||||
|
5
src/calibre/ebooks/unihandecode/pykakasi/__init__.py
Normal file
5
src/calibre/ebooks/unihandecode/pykakasi/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
from calibre.ebooks.unihandecode.pykakasi.kakasi import kakasi
kakasi

# __all__ must list names this module actually defines; the previous
# entry "pykakasi" does not exist here, so a star-import of the package
# raised AttributeError.
__all__ = ["kakasi"]
|
||||
|
185
src/calibre/ebooks/unihandecode/pykakasi/h2a.py
Normal file
185
src/calibre/ebooks/unihandecode/pykakasi/h2a.py
Normal file
@ -0,0 +1,185 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# h2a.py
|
||||
#
|
||||
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
|
||||
#
|
||||
# Original copyright:
|
||||
# * KAKASI (Kanji Kana Simple inversion program)
|
||||
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
|
||||
# * Copyright (C) 1992
|
||||
# * Hironobu Takahashi (takahasi@tiny.or.jp)
|
||||
# *
|
||||
# * This program is free software; you can redistribute it and/or modify
|
||||
# * it under the terms of the GNU General Public License as published by
|
||||
# * the Free Software Foundation; either versions 2, or (at your option)
|
||||
# * any later version.
|
||||
# *
|
||||
# * This program is distributed in the hope that it will be useful
|
||||
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# * GNU General Public License for more details.
|
||||
# *
|
||||
# * You should have received a copy of the GNU General Public License
|
||||
# * along with KAKASI, see the file COPYING. If not, write to the Free
|
||||
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
|
||||
# * 02111-1307, USA.
|
||||
# */
|
||||
|
||||
class H2a (object):
    '''
    Hiragana to ASCII (romaji) converter.

    ``H2a_table`` maps a hiragana string of one to four characters to
    its romanisation; ``convert()`` performs a longest-prefix lookup.
    '''

    H2a_table = {
        u"\u3041":"a", u"\u3042":"a",
        u"\u3043":"i", u"\u3044":"i",
        u"\u3045":"u", u"\u3046":"u",
        u"\u3046\u309b":"vu", u"\u3046\u309b\u3041":"va",
        u"\u3046\u309b\u3043":"vi", u"\u3046\u309b\u3047":"ve",
        u"\u3046\u309b\u3049":"vo",
        u"\u3047":"e", u"\u3048":"e",
        u"\u3049":"o", u"\u304a":"o",

        # ki/gi + small ya/yu/yo (U+3083/U+3085/U+3087). These keys
        # previously used small a/u/o, or gu instead of gi.
        u"\u304b":"ka", u"\u304c":"ga",
        u"\u304d":"ki", u"\u304d\u3083":"kya",
        u"\u304d\u3085":"kyu", u"\u304d\u3087":"kyo",
        u"\u304e":"gi", u"\u304e\u3083":"gya",
        u"\u304e\u3085":"gyu", u"\u304e\u3087":"gyo",
        u"\u304f":"ku", u"\u3050":"gu",
        u"\u3051":"ke", u"\u3052":"ge",
        u"\u3053":"ko", u"\u3054":"go",

        u"\u3055":"sa", u"\u3056":"za",
        u"\u3057":"shi", u"\u3057\u3083":"sha",
        u"\u3057\u3085":"shu", u"\u3057\u3087":"sho",
        u"\u3058":"ji", u"\u3058\u3083":"ja",
        u"\u3058\u3085":"ju", u"\u3058\u3087":"jo",
        u"\u3059":"su", u"\u305a":"zu",
        u"\u305b":"se", u"\u305c":"ze",
        u"\u305d":"so", u"\u305e":"zo",

        u"\u305f":"ta", u"\u3060":"da",
        u"\u3061":"chi", u"\u3061\u3047":"che", u"\u3061\u3083":"cha",
        u"\u3061\u3085":"chu", u"\u3061\u3087":"cho",
        u"\u3062":"ji", u"\u3062\u3083":"ja",
        u"\u3062\u3085":"ju", u"\u3062\u3087":"jo",

        u"\u3063":"tsu",
        u"\u3063\u3046\u309b":"vvu",
        u"\u3063\u3046\u309b\u3041":"vva",
        u"\u3063\u3046\u309b\u3043":"vvi",
        u"\u3063\u3046\u309b\u3047":"vve",
        u"\u3063\u3046\u309b\u3049":"vvo",
        u"\u3063\u304b":"kka", u"\u3063\u304c":"gga",
        u"\u3063\u304d":"kki", u"\u3063\u304d\u3083":"kkya",
        u"\u3063\u304d\u3085":"kkyu", u"\u3063\u304d\u3087":"kkyo",
        u"\u3063\u304e":"ggi", u"\u3063\u304e\u3083":"ggya",
        u"\u3063\u304e\u3085":"ggyu", u"\u3063\u304e\u3087":"ggyo",
        u"\u3063\u304f":"kku", u"\u3063\u3050":"ggu",
        u"\u3063\u3051":"kke", u"\u3063\u3052":"gge",
        u"\u3063\u3053":"kko", u"\u3063\u3054":"ggo",
        u"\u3063\u3055":"ssa", u"\u3063\u3056":"zza",
        u"\u3063\u3057":"sshi", u"\u3063\u3057\u3083":"ssha",
        u"\u3063\u3057\u3085":"sshu", u"\u3063\u3057\u3087":"ssho",
        u"\u3063\u3058":"jji", u"\u3063\u3058\u3083":"jja",
        u"\u3063\u3058\u3085":"jju", u"\u3063\u3058\u3087":"jjo",
        u"\u3063\u3059":"ssu", u"\u3063\u305a":"zzu",
        # "zze" is small tsu + ze (U+305C); it was keyed on zo (U+305E)
        # and silently overwritten by the "zzo" entry below.
        u"\u3063\u305b":"sse", u"\u3063\u305c":"zze",
        u"\u3063\u305d":"sso", u"\u3063\u305e":"zzo",
        u"\u3063\u305f":"tta", u"\u3063\u3060":"dda",
        u"\u3063\u3061":"tchi", u"\u3063\u3061\u3083":"tcha",
        u"\u3063\u3061\u3085":"tchu", u"\u3063\u3061\u3087":"tcho",
        u"\u3063\u3062":"jji", u"\u3063\u3062\u3083":"jjya",
        u"\u3063\u3062\u3085":"jjyu", u"\u3063\u3062\u3087":"jjyo",
        u"\u3063\u3064":"ttsu", u"\u3063\u3065":"zzu",
        u"\u3063\u3066":"tte", u"\u3063\u3067":"dde",
        u"\u3063\u3068":"tto", u"\u3063\u3069":"ddo",
        u"\u3063\u306f":"hha", u"\u3063\u3070":"bba",
        u"\u3063\u3071":"ppa",
        u"\u3063\u3072":"hhi", u"\u3063\u3072\u3083":"hhya",
        u"\u3063\u3072\u3085":"hhyu", u"\u3063\u3072\u3087":"hhyo",
        u"\u3063\u3073":"bbi", u"\u3063\u3073\u3083":"bbya",
        u"\u3063\u3073\u3085":"bbyu", u"\u3063\u3073\u3087":"bbyo",
        u"\u3063\u3074":"ppi", u"\u3063\u3074\u3083":"ppya",
        u"\u3063\u3074\u3085":"ppyu", u"\u3063\u3074\u3087":"ppyo",
        u"\u3063\u3075":"ffu", u"\u3063\u3075\u3041":"ffa",
        u"\u3063\u3075\u3043":"ffi", u"\u3063\u3075\u3047":"ffe",
        u"\u3063\u3075\u3049":"ffo",
        u"\u3063\u3076":"bbu", u"\u3063\u3077":"ppu",
        u"\u3063\u3078":"hhe", u"\u3063\u3079":"bbe",
        u"\u3063\u307a":"ppe",
        u"\u3063\u307b":"hho", u"\u3063\u307c":"bbo",
        u"\u3063\u307d":"ppo",
        u"\u3063\u3084":"yya", u"\u3063\u3086":"yyu",
        u"\u3063\u3088":"yyo",
        u"\u3063\u3089":"rra", u"\u3063\u308a":"rri",
        u"\u3063\u308a\u3083":"rrya", u"\u3063\u308a\u3085":"rryu",
        u"\u3063\u308a\u3087":"rryo",
        u"\u3063\u308b":"rru", u"\u3063\u308c":"rre",
        u"\u3063\u308d":"rro",

        u"\u3064":"tsu", u"\u3065":"zu",
        u"\u3066":"te", u"\u3067":"de", u"\u3067\u3043":"di",
        u"\u3068":"to", u"\u3069":"do",

        u"\u306a":"na",
        u"\u306b":"ni", u"\u306b\u3083":"nya",
        u"\u306b\u3085":"nyu", u"\u306b\u3087":"nyo",
        u"\u306c":"nu", u"\u306d":"ne", u"\u306e":"no",

        u"\u306f":"ha", u"\u3070":"ba", u"\u3071":"pa",
        u"\u3072":"hi", u"\u3072\u3083":"hya",
        u"\u3072\u3085":"hyu", u"\u3072\u3087":"hyo",
        u"\u3073":"bi", u"\u3073\u3083":"bya",
        u"\u3073\u3085":"byu", u"\u3073\u3087":"byo",
        u"\u3074":"pi", u"\u3074\u3083":"pya",
        u"\u3074\u3085":"pyu", u"\u3074\u3087":"pyo",
        u"\u3075":"fu", u"\u3075\u3041":"fa",
        u"\u3075\u3043":"fi", u"\u3075\u3047":"fe",
        u"\u3075\u3049":"fo",
        u"\u3076":"bu", u"\u3077":"pu",
        u"\u3078":"he", u"\u3079":"be", u"\u307a":"pe",
        u"\u307b":"ho", u"\u307c":"bo", u"\u307d":"po",

        u"\u307e":"ma",
        u"\u307f":"mi", u"\u307f\u3083":"mya",
        u"\u307f\u3085":"myu", u"\u307f\u3087":"myo",
        u"\u3080":"mu", u"\u3081":"me", u"\u3082":"mo",

        u"\u3083":"ya", u"\u3084":"ya",
        u"\u3085":"yu", u"\u3086":"yu",
        u"\u3087":"yo", u"\u3088":"yo",

        u"\u3089":"ra",
        u"\u308a":"ri", u"\u308a\u3083":"rya",
        u"\u308a\u3085":"ryu", u"\u308a\u3087":"ryo",
        u"\u308b":"ru", u"\u308c":"re", u"\u308d":"ro",

        u"\u308e":"wa", u"\u308f":"wa",
        u"\u3090":"i", u"\u3091":"e",
        u"\u3092":"wo", u"\u3093":"n",

        u"\u3093\u3042":"n'a", u"\u3093\u3044":"n'i",
        u"\u3093\u3046":"n'u", u"\u3093\u3048":"n'e",
        u"\u3093\u304a":"n'o",
    }

    # Length of the longest key, so convert() can reach every entry
    _max_key_len = max(len(key) for key in H2a_table)

    # this class is Borg
    _shared_state = {}

    def __new__(cls, *p, **k):
        # Arguments are accepted for compatibility but not forwarded:
        # object.__new__ rejects extra arguments on newer Pythons.
        self = object.__new__(cls)
        self.__dict__ = cls._shared_state
        return self

    def isHiragana(self, char):
        '''Return True if ``char`` is in the range U+3041..U+3093.'''
        return ( 0x3040 < ord(char) and ord(char) < 0x3094)

    def convert(self, text):
        '''
        Romanise the longest prefix of ``text`` found in ``H2a_table``.

        :return: ``(romaji, matched_length)``; ``("", -1)`` when no
                 prefix of ``text`` is in the table.
        '''
        Hstr = ""
        max_len = -1
        # Try every prefix length up to the longest key. The previous
        # bound stopped at three characters, so the four-character keys
        # (e.g. "vva") could never match.
        longest = min(self._max_key_len, len(text))
        for x in range(1, longest + 1):
            if text[:x] in self.H2a_table:
                max_len = x
                Hstr = self.H2a_table[text[:x]]
        return (Hstr, max_len)
|
||||
|
564
src/calibre/ebooks/unihandecode/pykakasi/itaijidict.utf8
Normal file
564
src/calibre/ebooks/unihandecode/pykakasi/itaijidict.utf8
Normal file
@ -0,0 +1,564 @@
|
||||
芦蘆
|
||||
壱一
|
||||
苅刈
|
||||
舘館
|
||||
曽曾
|
||||
菟兎
|
||||
島嶋
|
||||
盃杯
|
||||
冨富
|
||||
峯峰
|
||||
亘亙
|
||||
弌一
|
||||
乘乗
|
||||
亂乱
|
||||
豫予
|
||||
亊事
|
||||
弍二
|
||||
亞亜
|
||||
亰京
|
||||
从従
|
||||
仭仞
|
||||
佛仏
|
||||
來来
|
||||
儘侭
|
||||
伜倅
|
||||
假仮
|
||||
會会
|
||||
做作
|
||||
傳伝
|
||||
僞偽
|
||||
價価
|
||||
儉倹
|
||||
兒児
|
||||
兔兎
|
||||
竸競
|
||||
兩両
|
||||
囘回
|
||||
册冊
|
||||
冢塚
|
||||
冩写
|
||||
决決
|
||||
冱冴
|
||||
冰氷
|
||||
况況
|
||||
凉涼
|
||||
處処
|
||||
凾函
|
||||
刄刃
|
||||
刔抉
|
||||
刧劫
|
||||
剩剰
|
||||
劍剣
|
||||
劔剣
|
||||
劒剣
|
||||
剱剣
|
||||
劑剤
|
||||
辨弁
|
||||
勞労
|
||||
勳勲
|
||||
勵励
|
||||
勸勧
|
||||
區区
|
||||
卆卒
|
||||
丗世
|
||||
凖準
|
||||
夘卯
|
||||
卻却
|
||||
卷巻
|
||||
厠廁
|
||||
厦廈
|
||||
厮廝
|
||||
厰廠
|
||||
參参
|
||||
雙双
|
||||
咒呪
|
||||
單単
|
||||
噐器
|
||||
營営
|
||||
嚏嚔
|
||||
嚴厳
|
||||
囑嘱
|
||||
囓齧
|
||||
圀国
|
||||
圈圏
|
||||
國国
|
||||
圍囲
|
||||
圓円
|
||||
團団
|
||||
圖図
|
||||
埀垂
|
||||
埓埒
|
||||
塲場
|
||||
壞壊
|
||||
墮堕
|
||||
壓圧
|
||||
壘塁
|
||||
壥廛
|
||||
壤壌
|
||||
壯壮
|
||||
壺壷
|
||||
壹一
|
||||
壻婿
|
||||
壽寿
|
||||
夂夊
|
||||
夛多
|
||||
梦夢
|
||||
竒奇
|
||||
奧奥
|
||||
奬奨
|
||||
侫佞
|
||||
姙妊
|
||||
嫻嫺
|
||||
孃嬢
|
||||
學学
|
||||
斈学
|
||||
寃冤
|
||||
寇冦
|
||||
寢寝
|
||||
寫写
|
||||
寶宝
|
||||
寳宝
|
||||
尅剋
|
||||
將将
|
||||
專専
|
||||
對対
|
||||
尓爾
|
||||
尢尤
|
||||
屆届
|
||||
屬属
|
||||
峽峡
|
||||
嶌嶋
|
||||
嵜崎
|
||||
崙崘
|
||||
嵳嵯
|
||||
嶽岳
|
||||
巛川
|
||||
巵卮
|
||||
帋紙
|
||||
帶帯
|
||||
幤幣
|
||||
廐厩
|
||||
廏厩
|
||||
廣広
|
||||
廚厨
|
||||
廢廃
|
||||
廳庁
|
||||
廰庁
|
||||
廸迪
|
||||
弃棄
|
||||
弉奘
|
||||
彜彝
|
||||
彈弾
|
||||
彌弥
|
||||
弯彎
|
||||
徃往
|
||||
徑径
|
||||
從従
|
||||
徠来
|
||||
悳徳
|
||||
恠怪
|
||||
恆恒
|
||||
悧俐
|
||||
惡悪
|
||||
惠恵
|
||||
忰悴
|
||||
惱悩
|
||||
愼慎
|
||||
愽博
|
||||
慘惨
|
||||
慚慙
|
||||
憇憩
|
||||
應応
|
||||
懷懐
|
||||
懴懺
|
||||
戀恋
|
||||
戞戛
|
||||
戰戦
|
||||
戲戯
|
||||
拔抜
|
||||
拏拿
|
||||
擔担
|
||||
拜拝
|
||||
拂払
|
||||
挾挟
|
||||
搜捜
|
||||
插挿
|
||||
搖揺
|
||||
攝摂
|
||||
攪撹
|
||||
據拠
|
||||
擇択
|
||||
擧拳
|
||||
舉拳
|
||||
抬擡
|
||||
擴拡
|
||||
攜携
|
||||
攵攴
|
||||
攷考
|
||||
收収
|
||||
效効
|
||||
敕勅
|
||||
敍叙
|
||||
敘叙
|
||||
數数
|
||||
變変
|
||||
斷断
|
||||
旙旛
|
||||
昜陽
|
||||
晄晃
|
||||
晉晋
|
||||
晝昼
|
||||
晰晢
|
||||
暎映
|
||||
曉暁
|
||||
暸瞭
|
||||
昿曠
|
||||
曵曳
|
||||
朖朗
|
||||
朞期
|
||||
霸覇
|
||||
杤栃
|
||||
杰傑
|
||||
枩松
|
||||
檜桧
|
||||
條条
|
||||
檮梼
|
||||
梹檳
|
||||
棊棋
|
||||
棧桟
|
||||
棕椶
|
||||
楙茂
|
||||
榮栄
|
||||
槨椁
|
||||
樂楽
|
||||
權権
|
||||
樞枢
|
||||
樣様
|
||||
樓楼
|
||||
橢楕
|
||||
檢検
|
||||
櫻桜
|
||||
鬱欝
|
||||
盜盗
|
||||
飮飲
|
||||
歐嘔
|
||||
歡歓
|
||||
歸帰
|
||||
殘残
|
||||
殱殲
|
||||
殼殻
|
||||
毆殴
|
||||
毓育
|
||||
氣気
|
||||
沒没
|
||||
泪涙
|
||||
濤涛
|
||||
渕淵
|
||||
渊淵
|
||||
淨浄
|
||||
淺浅
|
||||
滿満
|
||||
溂剌
|
||||
溪渓
|
||||
灌潅
|
||||
滯滞
|
||||
澁渋
|
||||
澀渋
|
||||
潛潜
|
||||
濳潜
|
||||
澂澄
|
||||
澑溜
|
||||
澤沢
|
||||
濟済
|
||||
濕湿
|
||||
濱浜
|
||||
濾滬
|
||||
灣湾
|
||||
烱炯
|
||||
烟煙
|
||||
熈煕
|
||||
熏燻
|
||||
燒焼
|
||||
爐炉
|
||||
爭争
|
||||
爲為
|
||||
爼俎
|
||||
犁犂
|
||||
犹猶
|
||||
犲豺
|
||||
狹狭
|
||||
獎奨
|
||||
默黙
|
||||
獨独
|
||||
獸獣
|
||||
獵猟
|
||||
獻献
|
||||
珎珍
|
||||
璢瑠
|
||||
瑯琅
|
||||
珱瓔
|
||||
瓣弁
|
||||
甞嘗
|
||||
甼町
|
||||
畄留
|
||||
畍界
|
||||
畊耕
|
||||
畆畝
|
||||
畧略
|
||||
畫画
|
||||
當当
|
||||
畴疇
|
||||
疊畳
|
||||
疉畳
|
||||
疂畳
|
||||
癡痴
|
||||
發発
|
||||
皃猊
|
||||
皈帰
|
||||
皹皸
|
||||
盖蓋
|
||||
盡尽
|
||||
蘯盪
|
||||
眞真
|
||||
眦眥
|
||||
礦鉱
|
||||
礪砺
|
||||
碎砕
|
||||
碯瑙
|
||||
祕秘
|
||||
祿禄
|
||||
齋斎
|
||||
禪禅
|
||||
禮礼
|
||||
禀稟
|
||||
稱称
|
||||
稻稲
|
||||
稾稿
|
||||
穗穂
|
||||
穩穏
|
||||
龝穐
|
||||
穰穣
|
||||
窗窓
|
||||
竈竃
|
||||
窰窯
|
||||
竊窃
|
||||
竝並
|
||||
筺筐
|
||||
笋筍
|
||||
箟箘
|
||||
筝箏
|
||||
簔蓑
|
||||
籠篭
|
||||
籘籐
|
||||
籖籤
|
||||
粹粋
|
||||
糺糾
|
||||
絲糸
|
||||
經経
|
||||
總総
|
||||
緜綿
|
||||
縣県
|
||||
縱縦
|
||||
繪絵
|
||||
繩縄
|
||||
繼継
|
||||
緕纃
|
||||
續続
|
||||
纖繊
|
||||
纎繊
|
||||
纜繿
|
||||
缺欠
|
||||
罐缶
|
||||
罸罰
|
||||
羃冪
|
||||
羣群
|
||||
羮羹
|
||||
譱善
|
||||
翆翠
|
||||
翦剪
|
||||
耻恥
|
||||
聟婿
|
||||
聨聯
|
||||
聲声
|
||||
聰聡
|
||||
聽聴
|
||||
肅粛
|
||||
冐冒
|
||||
脉脈
|
||||
腦脳
|
||||
腟膣
|
||||
膓腸
|
||||
膸髄
|
||||
膽胆
|
||||
臈臘
|
||||
臟臓
|
||||
臺台
|
||||
與与
|
||||
舊旧
|
||||
舍舎
|
||||
舖舗
|
||||
舩船
|
||||
艢檣
|
||||
舮艫
|
||||
艷艶
|
||||
莖茎
|
||||
莊荘
|
||||
莵兎
|
||||
菷帚
|
||||
萠萌
|
||||
蕚萼
|
||||
蒂蔕
|
||||
萬万
|
||||
葢蓋
|
||||
蘂蕊
|
||||
蕋蕊
|
||||
藪薮
|
||||
藏蔵
|
||||
藝芸
|
||||
藥薬
|
||||
蘓蘇
|
||||
乕虎
|
||||
號号
|
||||
蠣蛎
|
||||
蝨虱
|
||||
蠅蝿
|
||||
螢蛍
|
||||
蟆蟇
|
||||
蟲虫
|
||||
蠏蟹
|
||||
蟷螳
|
||||
蟒蠎
|
||||
蠶蚕
|
||||
蠧蠹
|
||||
蠻蛮
|
||||
衂衄
|
||||
衞衛
|
||||
袵衽
|
||||
裝装
|
||||
襃褒
|
||||
褝襌
|
||||
覩睹
|
||||
覺覚
|
||||
覽覧
|
||||
觀観
|
||||
觧解
|
||||
觸触
|
||||
誡戒
|
||||
謌歌
|
||||
諡謚
|
||||
謠謡
|
||||
證証
|
||||
譛譖
|
||||
譯訳
|
||||
譽誉
|
||||
讀読
|
||||
讓譲
|
||||
讚賛
|
||||
豐豊
|
||||
貉狢
|
||||
貍狸
|
||||
貎猊
|
||||
豼貔
|
||||
貘獏
|
||||
戝財
|
||||
貭質
|
||||
貳弐
|
||||
貮弐
|
||||
賤賎
|
||||
賣売
|
||||
贊賛
|
||||
賍贓
|
||||
赱走
|
||||
踈疎
|
||||
踴踊
|
||||
躰体
|
||||
軆体
|
||||
軈軅
|
||||
軣轟
|
||||
輕軽
|
||||
輙輒
|
||||
輌輛
|
||||
轉転
|
||||
辭辞
|
||||
辯弁
|
||||
迯逃
|
||||
逹達
|
||||
逎遒
|
||||
遞逓
|
||||
遲遅
|
||||
邊辺
|
||||
邉辺
|
||||
邨村
|
||||
鄰隣
|
||||
醉酔
|
||||
醫医
|
||||
釀醸
|
||||
釋釈
|
||||
釡釜
|
||||
釼剣
|
||||
銕鉄
|
||||
錢銭
|
||||
鎭鎮
|
||||
鐵鉄
|
||||
鐡鉄
|
||||
鑒鑑
|
||||
鑄鋳
|
||||
鑛鉱
|
||||
鈩鑪
|
||||
鑚鑽
|
||||
閇閉
|
||||
濶闊
|
||||
關関
|
||||
阯址
|
||||
陷陥
|
||||
險険
|
||||
隱隠
|
||||
隸隷
|
||||
襍雑
|
||||
雜雑
|
||||
靈霊
|
||||
靜静
|
||||
靱靭
|
||||
韭韮
|
||||
韲齏
|
||||
韵韻
|
||||
顏顔
|
||||
顯顕
|
||||
飃飄
|
||||
餘余
|
||||
餝飾
|
||||
餠餅
|
||||
騷騒
|
||||
驅駆
|
||||
驛駅
|
||||
驗験
|
||||
髓髄
|
||||
體体
|
||||
髮髪
|
||||
鬪闘
|
||||
鰺鯵
|
||||
鰛鰮
|
||||
鳬鳧
|
||||
鳫鴈
|
||||
鵄鴟
|
||||
鵞鵝
|
||||
鷄鶏
|
||||
鷏鷆
|
||||
鹽塩
|
||||
麥麦
|
||||
麸麩
|
||||
麪麺
|
||||
點点
|
||||
黨党
|
||||
皷鼓
|
||||
鼡鼠
|
||||
齊斉
|
||||
齒歯
|
||||
齡齢
|
||||
龜亀
|
||||
槇槙
|
||||
遙遥
|
||||
瑤瑶
|
||||
凜凛
|
||||
熙煕
|
83
src/calibre/ebooks/unihandecode/pykakasi/j2h.py
Normal file
83
src/calibre/ebooks/unihandecode/pykakasi/j2h.py
Normal file
@ -0,0 +1,83 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# j2h.py
|
||||
#
|
||||
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
|
||||
#
|
||||
# Original Copyright:
|
||||
# * KAKASI (Kanji Kana Simple inversion program)
|
||||
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
|
||||
# * Copyright (C) 1992
|
||||
# * Hironobu Takahashi (takahasi@tiny.or.jp)
|
||||
# *
|
||||
# * This program is free software; you can redistribute it and/or modify
|
||||
# * it under the terms of the GNU General Public License as published by
|
||||
# * the Free Software Foundation; either versions 2, or (at your option)
|
||||
# * any later version.
|
||||
# *
|
||||
# * This program is distributed in the hope that it will be useful
|
||||
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# * GNU General Public License for more details.
|
||||
# *
|
||||
# * You should have received a copy of the GNU General Public License
|
||||
# * along with KAKASI, see the file COPYING. If not, write to the Free
|
||||
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
|
||||
# * 02111-1307, USA.
|
||||
# */
|
||||
|
||||
from calibre.ebooks.unihandecode.pykakasi.jisyo import jisyo
|
||||
import re
|
||||
|
||||
class J2H (object):
    '''
    Kanji to hiragana converter backed by the kanwa dictionary (jisyo).
    '''

    kanwa = None

    # Consonant classes for okurigana matching. The entry for a hiragana
    # character c lives at index ord(c) - ord(U+3041) + 1; index 0 is an
    # unused placeholder, so the 96 entries cover U+3041..U+309F.
    cl_table = [
        "","aiueow", "aiueow", "aiueow", "aiueow", "aiueow", "aiueow", "aiueow",
        "aiueow", "aiueow", "aiueow", "k", "g", "k", "g", "k", "g", "k", "g", "k",
        "g", "s", "zj", "s", "zj", "s", "zj", "s", "zj", "s", "zj", "t", "d", "tc",
        "d", "aiueokstchgzjfdbpw", "t", "d", "t", "d", "t", "d", "n", "n", "n", "n",
        "n", "h", "b", "p", "h", "b", "p", "hf", "b", "p", "h", "b", "p", "h", "b",
        "p", "m", "m", "m", "m", "m", "y", "y", "y", "y", "y", "y", "rl", "rl",
        "rl", "rl", "rl", "wiueo", "wiueo", "wiueo", "wiueo", "w", "n", "v", "k",
        "k", "", "", "", "", "", "", "", "", ""]

    def __init__(self):
        self.kanwa = jisyo()

    def isKanji(self, c):
        '''Return True if ``c`` is in the range U+3400..U+FA2D.'''
        return ( 0x3400 <= ord(c) and ord(c) < 0xfa2e)

    def isCletter(self, l, c):
        '''
        Return True if hiragana ``c`` belongs to the consonant class
        ``l`` (a single romaji letter such as "k").
        '''
        code = ord(c)
        if ord(u"\u3041") <= code <= 0x309f:
            # Offset is +1, not -1: the table has a leading placeholder,
            # so ka (U+304B) must hit "k", chi "tc", fu "hf", n "n".
            # With -1 every lookup landed two slots early.
            return l in self.cl_table[code - ord(u"\u3041") + 1]
        return False

    def itaiji_conv(self, text):
        '''
        Replace every variant kanji in ``text`` that is a key of the
        itaiji dictionary with its mapped form.
        '''
        r = []
        for c in text:
            if c in self.kanwa.itaijidict:
                r.append(c)
        for c in r:
            # Literal replacement; no regular expression is needed
            text = text.replace(c, self.kanwa.itaijidict[c])
        return text

    def convert(self, text):
        '''
        Find the longest dictionary match at the start of ``text``.

        :return: ``(reading, matched_length)``; ``("", 0)`` when the
                 dictionary has no table for the first character.
        '''
        max_len = 0
        Hstr = ""
        table = self.kanwa.load_jisyo(text[0])
        if table is None:
            return ("", 0)
        for (k,v) in table.items():
            length = len(k)
            if len(text) >= length:
                if text.startswith(k):
                    for (yomi, tail) in v:
                        # Compare by value; identity with a literal is an
                        # implementation detail of string interning.
                        if tail == '':
                            if max_len < length:
                                Hstr = yomi
                                max_len = length
                        elif max_len < length+1 and len(text) > length and self.isCletter(tail, text[length]):
                            # The entry needs okurigana: consume the next
                            # character when its consonant class matches.
                            Hstr=''.join([yomi,text[length]])
                            max_len = length+1
        return (Hstr, max_len)
|
53
src/calibre/ebooks/unihandecode/pykakasi/jisyo.py
Normal file
53
src/calibre/ebooks/unihandecode/pykakasi/jisyo.py
Normal file
@ -0,0 +1,53 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# jisyo.py
|
||||
#
|
||||
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
|
||||
from cPickle import load
|
||||
import anydbm,marshal
|
||||
from zlib import decompress
|
||||
import os
|
||||
|
||||
import calibre.utils.resources as resources
|
||||
|
||||
class jisyo (object):
    '''
    Shared access to the pykakasi dictionaries.

    ``kanwadict`` is a dbm database keyed by the four-digit hex code
    point of a kanji; ``itaijidict`` and ``kanadict`` are pickled dicts.
    All three are loaded from calibre's resources on first use.
    '''
    kanwadict = None
    itaijidict = None
    kanadict = None
    # Cache of already decoded kanwadict entries, keyed like kanwadict
    jisyo_table = {}

    # this class is Borg
    _shared_state = {}

    def __new__(cls, *p, **k):
        # Arguments are accepted for compatibility but not forwarded:
        # object.__new__ rejects extra arguments on newer Pythons.
        self = object.__new__(cls)
        self.__dict__ = cls._shared_state
        return self

    def __init__(self):
        if self.kanwadict is None:
            dictpath = resources.get_path(os.path.join('localization','pykakasi','kanwadict2.db'))
            self.kanwadict = anydbm.open(dictpath,'r')
        if self.itaijidict is None:
            itaijipath = resources.get_path(os.path.join('localization','pykakasi','itaijidict2.pickle'))
            # Close the pickle file once loaded instead of leaking it
            with open(itaijipath, 'rb') as itaiji_pkl:
                self.itaijidict = load(itaiji_pkl)
        if self.kanadict is None:
            kanadictpath = resources.get_path(os.path.join('localization','pykakasi','kanadict2.pickle'))
            with open(kanadictpath, 'rb') as kanadict_pkl:
                self.kanadict = load(kanadict_pkl)

    def load_jisyo(self, char):
        '''
        Return the dictionary table for ``char``, or None when the
        database has no (readable) entry for it. Decoded tables are
        cached in ``jisyo_table``.
        '''
        try:#python2
            key = "%04x"%ord(unicode(char))
        except Exception:#python3
            key = "%04x"%ord(char)

        try: #already exist?
            return self.jisyo_table[key]
        except KeyError:
            pass
        try:
            table = self.jisyo_table[key] = marshal.loads(decompress(self.kanwadict[key]))
        except Exception:
            # Best effort: a missing or undecodable entry means
            # "no dictionary data for this character".
            return None
        return table
|
||||
|
50
src/calibre/ebooks/unihandecode/pykakasi/k2a.py
Normal file
50
src/calibre/ebooks/unihandecode/pykakasi/k2a.py
Normal file
@ -0,0 +1,50 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# k2a.py
|
||||
#
|
||||
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
|
||||
#
|
||||
# Original copyright:
|
||||
# * KAKASI (Kanji Kana Simple inversion program)
|
||||
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
|
||||
# * Copyright (C) 1992
|
||||
# * Hironobu Takahashi (takahasi@tiny.or.jp)
|
||||
# *
|
||||
# * This program is free software; you can redistribute it and/or modify
|
||||
# * it under the terms of the GNU General Public License as published by
|
||||
# * the Free Software Foundation; either versions 2, or (at your option)
|
||||
# * any later version.
|
||||
# *
|
||||
# * This program is distributed in the hope that it will be useful
|
||||
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# * GNU General Public License for more details.
|
||||
# *
|
||||
# * You should have received a copy of the GNU General Public License
|
||||
# * along with KAKASI, see the file COPYING. If not, write to the Free
|
||||
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
|
||||
# * 02111-1307, USA.
|
||||
# */
|
||||
|
||||
from calibre.ebooks.unihandecode.pykakasi.jisyo import jisyo
|
||||
|
||||
class K2a (object):
    """Katakana-to-alphabet converter using the shared ``jisyo`` kana table."""

    # jisyo instance whose ``kanadict`` maps kana strings to romanizations
    kanwa = None

    def __init__(self):
        self.kanwa = jisyo()

    def isKatakana(self, char):
        """Return True if ``char`` lies strictly between U+30A0 and U+30F7."""
        return 0x30a0 < ord(char) < 0x30f7

    def convert(self, text):
        """Romanize the longest prefix of ``text`` found in ``kanadict``.

        Prefixes of up to 9 characters are considered. Returns a tuple
        ``(romanization, prefix_length)``, or ``("", -1)`` when no prefix
        of ``text`` is in the table.
        """
        kanadict = self.kanwa.kanadict
        longest = min(10, len(text) + 1) - 1
        # Search longest-first so the first hit is the answer. ``range``
        # rather than ``xrange`` keeps this working on Python 2 and 3.
        for x in range(longest, -1, -1):
            prefix = text[:x]
            if prefix in kanadict:
                return (kanadict[prefix], x)
        return ("", -1)
|
||||
|
101
src/calibre/ebooks/unihandecode/pykakasi/kakasi.py
Normal file
101
src/calibre/ebooks/unihandecode/pykakasi/kakasi.py
Normal file
@ -0,0 +1,101 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# kakasi.py
|
||||
#
|
||||
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
|
||||
#
|
||||
# Original Copyright:
|
||||
# * KAKASI (Kanji Kana Simple inversion program)
|
||||
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
|
||||
# * Copyright (C) 1992
|
||||
# * Hironobu Takahashi (takahasi@tiny.or.jp)
|
||||
# *
|
||||
# * This program is free software; you can redistribute it and/or modify
|
||||
# * it under the terms of the GNU General Public License as published by
|
||||
# * the Free Software Foundation; either versions 2, or (at your option)
|
||||
# * any later version.
|
||||
# *
|
||||
# * This program is distributed in the hope that it will be useful
|
||||
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# * GNU General Public License for more details.
|
||||
# *
|
||||
# * You should have received a copy of the GNU General Public License
|
||||
# * along with KAKASI, see the file COPYING. If not, write to the Free
|
||||
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
|
||||
# * 02111-1307, USA.
|
||||
# */
|
||||
|
||||
from calibre.ebooks.unihandecode.pykakasi.j2h import J2H
|
||||
from calibre.ebooks.unihandecode.pykakasi.h2a import H2a
|
||||
from calibre.ebooks.unihandecode.pykakasi.k2a import K2a
|
||||
|
||||
class kakasi(object):
    """Romanize Japanese text by chaining the J2H, H2a and K2a converters."""

    j2h = None  # kanji -> hiragana reading
    h2a = None  # hiragana -> alphabet
    k2a = None  # katakana -> alphabet

    def __init__(self):
        self.j2h = J2H()
        self.h2a = H2a()
        self.k2a = K2a()

    def _kana_run(self, converter, is_kana, text, i):
        """Romanize the run of kana starting at ``text[i]``.

        Returns ``(romanized, next_index)``. A character the converter
        cannot match (non-positive length) is copied through unchanged and
        the index still advances by one, so the scan always makes progress.
        """
        pieces = []
        while True:
            (t, l) = converter.convert(text[i:])
            if l <= 0:
                # No table entry: without this guard ``i`` would move
                # backwards and the loop might never terminate.
                pieces.append(text[i])
                l = 1
            else:
                pieces.append(t)
            i = i + l
            if i >= len(text) or not is_kana(text[i]):
                return (''.join(pieces), i)

    def do(self, text):
        """Return ``text`` with kanji, hiragana and katakana romanized.

        Kanji readings are capitalized. Each converted run is followed by
        a single space unless it ends the input. Characters that are none
        of the three scripts are copied through unchanged.
        """
        otext = ''
        i = 0
        while i < len(text):
            if self.j2h.isKanji(text[i]):
                (t, l) = self.j2h.convert(text[i:])
                if l <= 0:
                    # No reading known for this kanji: keep it as is.
                    otext = otext + text[i]
                    i = i + 1
                    continue
                i = i + l
                # Romanize the hiragana reading piece by piece; stop at
                # the first piece the hiragana table cannot convert.
                m = 0
                tmptext = ""
                while m < len(t):
                    (s, n) = self.h2a.convert(t[m:])
                    if n <= 0:
                        break
                    m = m + n
                    tmptext = tmptext + s
                if i >= len(text):
                    otext = otext + tmptext.capitalize()
                else:
                    otext = otext + tmptext.capitalize() + ' '
            elif self.h2a.isHiragana(text[i]):
                (tmptext, i) = self._kana_run(self.h2a, self.h2a.isHiragana, text, i)
                if i >= len(text):
                    otext = otext + tmptext
                else:
                    otext = otext + tmptext + ' '
            elif self.k2a.isKatakana(text[i]):
                (tmptext, i) = self._kana_run(self.k2a, self.k2a.isKatakana, text, i)
                if i >= len(text):
                    otext = otext + tmptext
                else:
                    otext = otext + tmptext + ' '
            else:
                otext = otext + text[i]
                i += 1

        return otext
|
||||
|
121826
src/calibre/ebooks/unihandecode/pykakasi/kakasidict.utf8
Normal file
121826
src/calibre/ebooks/unihandecode/pykakasi/kakasidict.utf8
Normal file
File diff suppressed because it is too large
Load Diff
317
src/calibre/ebooks/unihandecode/pykakasi/kanadict.utf8
Normal file
317
src/calibre/ebooks/unihandecode/pykakasi/kanadict.utf8
Normal file
@ -0,0 +1,317 @@
|
||||
;; Kana-Alphabet mapping dictionary
|
||||
;;
|
||||
;; To use this mapping table,
|
||||
;; you should first normalize the input text to Unicode NFKC form.
|
||||
;;
|
||||
;; basic mapping
|
||||
;;
|
||||
a ァ
|
||||
a ア
|
||||
ba バ
|
||||
bba ッバ
|
||||
bbe ッベ
|
||||
bbi ッビ
|
||||
bbo ッボ
|
||||
bbu ッブ
|
||||
bbya ッビャ
|
||||
bbyo ッビョ
|
||||
bbyu ッビュ
|
||||
be ベ
|
||||
bi ビ
|
||||
bo ボ
|
||||
bu ブ
|
||||
bya ビャ
|
||||
byo ビョ
|
||||
byu ビュ
|
||||
cha チャ
|
||||
che チェ
|
||||
chi チ
|
||||
cho チョ
|
||||
chu チュ
|
||||
da ダ
|
||||
dda ッダ
|
||||
dde ッデ
|
||||
ddo ッド
|
||||
de デ
|
||||
di ディ
|
||||
do ド
|
||||
e ェ
|
||||
e エ
|
||||
e ヱ
|
||||
fa ファ
|
||||
fe フェ
|
||||
ffa ッファ
|
||||
ffe ッフェ
|
||||
ffi ッフィ
|
||||
ffo ッフォ
|
||||
ffu ッフ
|
||||
fi フィ
|
||||
fo フォ
|
||||
fu フ
|
||||
ga ガ
|
||||
ge ゲ
|
||||
gga ッガ
|
||||
gge ッゲ
|
||||
ggi ッギ
|
||||
ggo ッゴ
|
||||
ggu ッグ
|
||||
ggya ッギャ
|
||||
ggyo ッギョ
|
||||
ggyu ッギュ
|
||||
gi ギ
|
||||
go ゴ
|
||||
gu グ
|
||||
gya グャ
|
||||
gyo ギョ
|
||||
gyu ギゥ
|
||||
ha ハ
|
||||
he ヘ
|
||||
hha ッハ
|
||||
hhe ッヘ
|
||||
hhi ッヒ
|
||||
hho ッホ
|
||||
hhya ッヒャ
|
||||
hhyo ッヒョ
|
||||
hhyu ッヒュ
|
||||
hi ヒ
|
||||
ho ホ
|
||||
hya ヒャ
|
||||
hyo ヒョ
|
||||
hyu ヒュ
|
||||
i ィ
|
||||
i イ
|
||||
i ヰ
|
||||
ja ジャ
|
||||
ja ヂャ
|
||||
ji ジ
|
||||
ji ヂ
|
||||
jja ッジャ
|
||||
jji ッジ
|
||||
jji ッヂ
|
||||
jjo ッジョ
|
||||
jju ッジュ
|
||||
jjya ッヂャ
|
||||
jjyo ッヂョ
|
||||
jjyu ッヂュ
|
||||
jo ジョ
|
||||
jo ヂョ
|
||||
ju ジュ
|
||||
ju ヂュ
|
||||
ka カ
|
||||
ka ヵ
|
||||
ke ケ
|
||||
ke ヶ
|
||||
ki キ
|
||||
kka ッカ
|
||||
kke ッケ
|
||||
kki ッキ
|
||||
kko ッコ
|
||||
kku ック
|
||||
kkya ッキャ
|
||||
kkyo ッキョ
|
||||
kkyu ッキュ
|
||||
ko コ
|
||||
ku ク
|
||||
kya キァ
|
||||
kyo キォ
|
||||
kyu キゥ
|
||||
ma マ
|
||||
me メ
|
||||
mi ミ
|
||||
mo モ
|
||||
mu ム
|
||||
mya ミャ
|
||||
myo ミョ
|
||||
myu ミュ
|
||||
n ン
|
||||
n'a ンア
|
||||
n'e ンエ
|
||||
n'i ンイ
|
||||
n'o ンオ
|
||||
n'u ンウ
|
||||
na ナ
|
||||
ne ネ
|
||||
ni ニ
|
||||
no ノ
|
||||
nu ヌ
|
||||
nya ニャ
|
||||
nyo ニョ
|
||||
nyu ニュ
|
||||
o ォ
|
||||
o オ
|
||||
pa パ
|
||||
pe ペ
|
||||
pi ピ
|
||||
po ポ
|
||||
ppa ッパ
|
||||
ppe ッペ
|
||||
ppi ッピ
|
||||
ppo ッポ
|
||||
ppu ップ
|
||||
ppya ッピャ
|
||||
ppyo ッピョ
|
||||
ppyu ッピュ
|
||||
pu プ
|
||||
pya ピャ
|
||||
pyo ピョ
|
||||
pyu ピュ
|
||||
ra ラ
|
||||
re レ
|
||||
ri リ
|
||||
ro ロ
|
||||
rra ッラ
|
||||
rre ッレ
|
||||
rri ッリ
|
||||
rro ッロ
|
||||
rru ッル
|
||||
rrya ッリャ
|
||||
rryo ッリョ
|
||||
rryu ッリュ
|
||||
ru ル
|
||||
rya リャ
|
||||
ryo リョ
|
||||
ryu リュ
|
||||
sa サ
|
||||
se セ
|
||||
sha シャ
|
||||
shi シ
|
||||
sho ショ
|
||||
shu シュ
|
||||
so ソ
|
||||
ssa ッサ
|
||||
sse ッセ
|
||||
ssha ッシャ
|
||||
sshi ッシ
|
||||
ssho ッショ
|
||||
sshu ッシュ
|
||||
sso ッソ
|
||||
ssu ッス
|
||||
su ス
|
||||
ta タ
|
||||
tcha ッチャ
|
||||
tchi ッチ
|
||||
tcho ッチョ
|
||||
tchu ッチュ
|
||||
te テ
|
||||
to ト
|
||||
tsu ッ
|
||||
tsu ツ
|
||||
tta ッタ
|
||||
tte ッテ
|
||||
tto ット
|
||||
ttsu ッツ
|
||||
u ゥ
|
||||
u ウ
|
||||
va ヴァ
|
||||
ve ヴェ
|
||||
vi ヴィ
|
||||
vo ヴォ
|
||||
vu ヴ
|
||||
vva ッヴァ
|
||||
vve ッヴェ
|
||||
vvi ッヴィ
|
||||
vvo ッヴォ
|
||||
vvu ッヴ
|
||||
wa ヮ
|
||||
wa ワ
|
||||
wo ヲ
|
||||
ya ャ
|
||||
ya ヤ
|
||||
yo ョ
|
||||
yo ヨ
|
||||
yu ュ
|
||||
yu ユ
|
||||
yya ッヤ
|
||||
yyo ッヨ
|
||||
yyu ッユ
|
||||
za ザ
|
||||
ze ゼ
|
||||
zo ゾ
|
||||
zu ズ
|
||||
zu ヅ
|
||||
zza ッザ
|
||||
zzo ッゾ
|
||||
zzu ッズ
|
||||
zzu ッヅ
|
||||
;;
|
||||
;; extended characters
|
||||
;;
|
||||
;;
|
||||
;; gairai terms
|
||||
;;
|
||||
all オール
|
||||
algrism アルゴリズム
|
||||
answer アンサー
|
||||
base ベース
|
||||
begineer ビギナー
|
||||
connection コネクション
|
||||
contents コンテンツ
|
||||
creator クリエーター
|
||||
comic コミック
|
||||
comics コミックス
|
||||
culture カルチャー
|
||||
debug デバッグ
|
||||
debugging デバッギング
|
||||
design デザイン
|
||||
digital デジタル
|
||||
dillenma ジレンマ
|
||||
directory ディレクトリ
|
||||
disk ディスク
|
||||
document ドキュメント
|
||||
download ダウンロード
|
||||
electric エレクトリック
|
||||
facebook フェイスブック
|
||||
firefox ファイアーフォックス
|
||||
folder フォルダ
|
||||
format フォーマット
|
||||
forum フォーラム
|
||||
fox フォックス
|
||||
free フリー
|
||||
gnome ノーム
|
||||
gnu グヌー
|
||||
gozilla ゴジラ
|
||||
guide ガイド
|
||||
harvard ハーバード
|
||||
help ヘルプ
|
||||
highlight ハイライト
|
||||
japan ジャパン
|
||||
journal ジャーナル
|
||||
library ライブラリ
|
||||
line ライン
|
||||
love ラヴ
|
||||
love ラブ
|
||||
mail メール
|
||||
main メイン
|
||||
mystery ミステリ
|
||||
mozilla モジラ
|
||||
network ネットワーク
|
||||
next ネクスト
|
||||
new ニュー
|
||||
news ニュース
|
||||
native ネイティブ
|
||||
online オンライン
|
||||
open オープン
|
||||
professional プロフェッショナル
|
||||
profile プロファイル
|
||||
programmer プログラマ
|
||||
sample サンプル
|
||||
series シリーズ
|
||||
share シェア
|
||||
social ソーシャル
|
||||
society ソサエティ
|
||||
software ソフトウエア
|
||||
source ソース
|
||||
street ストリート
|
||||
system システム
|
||||
tag タグ
|
||||
text テキスト
|
||||
thunderbird サンダーバード
|
||||
training トレーニング
|
||||
twitter ツイッター
|
||||
unicode ユニコード
|
||||
wall ウオール
|
||||
wall ウォール
|
||||
welcome ウェルカム
|
||||
welcome ウエルカム
|
||||
wikinomics ウィキノミクス
|
||||
york ヨーク
|
1798
src/calibre/ebooks/unihandecode/unicodepoints.py
Normal file
1798
src/calibre/ebooks/unihandecode/unicodepoints.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,12 +1,16 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Decode unicode text to an ASCII representation of the text. Transliterate
|
||||
unicode characters to ASCII.
|
||||
Decode unicode text to an ASCII representation of the text in Chinese.
|
||||
Transliterate unicode characters to ASCII based on Chinese pronunciation.
|
||||
|
||||
derived from John's unidecode library.
|
||||
|
||||
Copyright(c) 2009, John Schember
|
||||
|
||||
Based on the ruby unidecode gem (http://rubyforge.org/projects/unidecode/) which
|
||||
is based on the perl module Text::Unidecode
|
||||
@ -55,29 +59,20 @@ it under the same terms as Perl itself.
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from calibre.ebooks.unidecode.unicodepoints import CODEPOINTS
|
||||
from calibre.constants import preferred_encoding
|
||||
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
|
||||
from calibre.ebooks.unihandecode.zhcodepoints import CODEPOINTS as HANCODES
|
||||
|
||||
class Unidecoder(object):
|
||||
|
||||
codepoints = {}
|
||||
|
||||
def __init__(self):
|
||||
self.codepoints = CODEPOINTS
|
||||
self.codepoints.update(HANCODES)
|
||||
|
||||
def decode(self, text):
|
||||
'''
|
||||
Tranliterate the string from unicode characters to ASCII.
|
||||
'''
|
||||
# The keys for CODEPOINTS is unicode characters, we want to be sure the
|
||||
# input text is unicode.
|
||||
if not isinstance(text, unicode):
|
||||
try:
|
||||
text = unicode(text)
|
||||
except:
|
||||
try:
|
||||
text = text.decode(preferred_encoding)
|
||||
except:
|
||||
text = text.decode('utf-8', 'replace')
|
||||
# Replace characters larger than 127 with their ASCII equivelent.
|
||||
return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),
|
||||
text)
|
||||
return re.sub('[^\x00-\x7f]',lambda x: self.replace_point(x.group()), text)
|
||||
|
||||
def replace_point(self, codepoint):
|
||||
'''
|
||||
@ -87,7 +82,7 @@ class Unidecoder(object):
|
||||
# Split the unicode character xABCD into parts 0xAB and 0xCD.
|
||||
# 0xAB represents the group within CODEPOINTS to query and 0xCD
|
||||
# represents the position in the list of characters for the group.
|
||||
return CODEPOINTS[self.code_group(codepoint)][self.grouped_point(
|
||||
return self.codepoints[self.code_group(codepoint)][self.grouped_point(
|
||||
codepoint)]
|
||||
except:
|
||||
return '?'
|
||||
@ -97,12 +92,18 @@ class Unidecoder(object):
|
||||
Find what group character is a part of.
|
||||
'''
|
||||
# Code groups withing CODEPOINTS take the form 'xAB'
|
||||
return u'x%02x' % (ord(unicode(character)) >> 8)
|
||||
try:#python2
|
||||
return 'x%02x' % (ord(unicode(character)) >> 8)
|
||||
except:
|
||||
return 'x%02x' % (ord(character) >> 8)
|
||||
|
||||
def grouped_point(self, character):
|
||||
'''
|
||||
Return the location the replacement character is in the list for a
|
||||
the group character is a part of.
|
||||
'''
|
||||
return ord(unicode(character)) & 255
|
||||
try:#python2
|
||||
return ord(unicode(character)) & 255
|
||||
except:
|
||||
return ord(character) & 255
|
||||
|
5251
src/calibre/ebooks/unihandecode/vncodepoints.py
Normal file
5251
src/calibre/ebooks/unihandecode/vncodepoints.py
Normal file
File diff suppressed because it is too large
Load Diff
23
src/calibre/ebooks/unihandecode/vndecoder.py
Normal file
23
src/calibre/ebooks/unihandecode/vndecoder.py
Normal file
@ -0,0 +1,23 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Decode unicode text to an ASCII representation of the text in Vietnamese.
|
||||
|
||||
'''
|
||||
|
||||
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
|
||||
from calibre.ebooks.unihandecode.vncodepoints import CODEPOINTS as HANCODES
|
||||
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
|
||||
|
||||
class Vndecoder(Unidecoder):
    """Unidecoder whose lookup table is overlaid with the Vietnamese code points."""

    codepoints = {}

    def __init__(self):
        # Copy before overlaying: CODEPOINTS is the module-level table from
        # unicodepoints, and updating it in place would change it for every
        # other importer of that module. A shallow copy suffices because
        # update() only rebinds top-level keys.
        self.codepoints = dict(CODEPOINTS)
        self.codepoints.update(HANCODES)
|
||||
|
5251
src/calibre/ebooks/unihandecode/zhcodepoints.py
Normal file
5251
src/calibre/ebooks/unihandecode/zhcodepoints.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -6,12 +6,12 @@ meaning as possible.
|
||||
import os
|
||||
from math import ceil
|
||||
|
||||
from calibre.ebooks.unidecode.unidecoder import Unidecoder
|
||||
from calibre import sanitize_file_name
|
||||
from calibre.constants import preferred_encoding, iswindows
|
||||
udc = Unidecoder()
|
||||
from calibre.utils.localization import get_udc
|
||||
|
||||
def ascii_text(orig):
|
||||
udc = get_udc()
|
||||
try:
|
||||
ascii = udc.decode(orig)
|
||||
except:
|
||||
|
@ -169,3 +169,13 @@ def set_qt_translator(translator):
|
||||
return translator.load(p)
|
||||
return False
|
||||
|
||||
# Process-wide decoder instance, created on the first get_udc() call.
_udc = None


def get_udc():
    """Return the cached Unihandecoder, creating it on first use.

    The decoder is constructed with the language reported by get_lang()
    at the time of the first call and reused on every later call.
    """
    global _udc
    if _udc is not None:
        return _udc
    # Imported here rather than at module load time.
    from calibre.ebooks.unihandecode import Unihandecoder
    decoder = Unihandecoder(lang=get_lang())
    _udc = decoder
    return decoder
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user