Merge from trunk

Charles Haley 2011-02-15 14:28:06 +00:00
commit 4dbf65009c
36 changed files with 146,380 additions and 3,331 deletions


@@ -193,6 +193,33 @@ License: GPL-3
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL-3 on Debian systems.
Files: src/calibre/ebooks/unihandecode/pykakasi/*
Copyright: 2011, Hiroshi Miura <miurahr@linux.com>
Copyright: 1992, Hironobu Takahashi
License: GPL-2+
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL on Debian systems.
Files: resources/kanwadict2.db
Files: resources/itaijidict2.pickle
Copyright: 2011, Hiroshi Miura <miurahr@linux.com>
Copyright: 1992 1993 1994, Hironobu Takahashi (takahasi@tiny.or.jp),
Copyright: 1992 1993 1994, Masahiko Sato (masahiko@sato.riec.tohoku.ac.jp),
Copyright: 1992 1993 1994, Yukiyoshi Kameyama, Miki Inooka, Akihiko Sasaki, Dai Ando, Junichi Okukawa,
Copyright: 1992 1993 1994, Katsushi Sato and Nobuhiro Yamagishi
License: GPL-2+
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL on Debian systems.
Files: src/calibre/ebooks/unihandecode/*
Copyright: 2010-2011, Hiroshi Miura <miurahr@linux.com>
Copyright: 2009, John Schember
Copyright: 2007, Russell Norris
Copyright: 2001, Sean M. Burke
License: GPL-3, Perl
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL-3 on Debian systems.
Files: src/encutils/__init__.py
Copyright: 2005-2008, Christof Hoeke
License: LGPL-3+, CC-BY-3.0


@@ -13,12 +13,12 @@
id="Layer_1"
x="0px"
y="0px"
width="134.77701"
height="199.99901"
viewBox="0 0 134.777 199.999"
width="200"
height="200"
viewBox="0 0 199.99999 199.99999"
enable-background="new 0 0 595.28 841.89"
xml:space="preserve"
inkscape:version="0.47 r22583"
inkscape:version="0.48.0 r9654"
sodipodi:docname="news.svg"><metadata
id="metadata26"><rdf:RDF><cc:Work
rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
@@ -38,22 +38,22 @@
guidetolerance="10"
inkscape:pageopacity="0"
inkscape:pageshadow="2"
inkscape:window-width="640"
inkscape:window-height="489"
inkscape:window-width="1680"
inkscape:window-height="997"
id="namedview22"
showgrid="false"
inkscape:zoom="0.28032165"
inkscape:cx="67.389001"
inkscape:cy="99.722002"
inkscape:window-x="0"
inkscape:window-y="41"
inkscape:window-maximized="0"
inkscape:window-x="-4"
inkscape:window-y="30"
inkscape:window-maximized="1"
inkscape:current-layer="Layer_1" />
<g
id="g3"
transform="translate(-230.25101,-320.668)">
transform="translate(-194.57771,-320.66701)">
<polygon
points="360.241,366.109 345.29,359.678 345.29,343.405 329.945,343.405 324.265,329.15 309.147,335.175 297.64,323.667 286.79,334.517 272.693,328.454 266.263,343.405 249.988,343.405 249.988,358.749 235.734,364.429 241.759,379.548 230.251,391.056 241.101,401.906 235.039,416.002 249.988,422.432 249.988,438.706 265.333,438.706 271.013,452.961 277.817,450.25 277.817,475.111 252.085,475.111 297.64,520.667 343.193,475.111 317.463,475.111 317.463,451.453 322.585,453.656 329.016,438.706 345.29,438.706 345.29,423.362 359.546,417.682 353.521,402.563 365.028,391.056 354.178,380.205 "
points="286.79,334.517 272.693,328.454 266.263,343.405 249.988,343.405 249.988,358.749 235.734,364.429 241.759,379.548 230.251,391.056 241.101,401.906 235.039,416.002 249.988,422.432 249.988,438.706 265.333,438.706 271.013,452.961 277.817,450.25 277.817,475.111 252.085,475.111 297.64,520.667 343.193,475.111 317.463,475.111 317.463,451.453 322.585,453.656 329.016,438.706 345.29,438.706 345.29,423.362 359.546,417.682 353.521,402.563 365.028,391.056 354.178,380.205 360.241,366.109 345.29,359.678 345.29,343.405 329.945,343.405 324.265,329.15 309.147,335.175 297.64,323.667 "
id="polygon5"
style="fill:#ffffff" />
<linearGradient
@@ -73,7 +73,7 @@
id="stop10" />
</linearGradient>
<polygon
points="360.241,363.11 345.29,356.679 345.29,340.406 329.945,340.406 324.265,326.151 309.147,332.176 297.64,320.668 286.79,331.518 272.693,325.455 266.263,340.406 249.988,340.406 249.988,355.75 235.734,361.43 241.759,376.549 230.251,388.057 241.101,398.907 235.039,413.003 249.988,419.433 249.988,435.707 265.333,435.707 271.013,449.962 277.817,447.251 277.817,472.112 252.085,472.112 297.64,517.668 343.193,472.112 317.463,472.112 317.463,448.454 322.585,450.657 329.016,435.707 345.29,435.707 345.29,420.363 359.546,414.683 353.521,399.564 365.028,388.057 354.178,377.206 "
points="286.79,331.518 272.693,325.455 266.263,340.406 249.988,340.406 249.988,355.75 235.734,361.43 241.759,376.549 230.251,388.057 241.101,398.907 235.039,413.003 249.988,419.433 249.988,435.707 265.333,435.707 271.013,449.962 277.817,447.251 277.817,472.112 252.085,472.112 297.64,517.668 343.193,472.112 317.463,472.112 317.463,448.454 322.585,450.657 329.016,435.707 345.29,435.707 345.29,420.363 359.546,414.683 353.521,399.564 365.028,388.057 354.178,377.206 360.241,363.11 345.29,356.679 345.29,340.406 329.945,340.406 324.265,326.151 309.147,332.176 297.64,320.668 "
id="polygon12"
style="fill:url(#SVGID_1_)" />
<g
@@ -81,14 +81,16 @@
<path
d="m 273.311,419.168 v -56.752 h 17.935 l 9.01,17.43 c 3.115,5.641 6.399,13.22 8.926,19.873 h 0.252 c -0.842,-7.494 -1.178,-15.41 -1.178,-23.83 v -13.472 h 13.893 v 56.752 H 306.15 l -9.684,-18.861 c -3.116,-5.978 -6.82,-13.641 -9.515,-20.461 h -0.336 c 0.42,7.663 0.589,16.167 0.589,25.345 v 13.978 h -13.893 z"
id="path16"
style="fill:#993720" />
style="fill:#993720"
inkscape:connector-curvature="0" />
</g>
<g
id="g18">
<path
d="m 273.311,416.873 v -56.752 h 17.935 l 9.01,17.43 c 3.115,5.641 6.399,13.22 8.926,19.873 h 0.252 c -0.842,-7.494 -1.178,-15.41 -1.178,-23.83 v -13.472 h 13.893 v 56.752 H 306.15 l -9.684,-18.861 c -3.116,-5.978 -6.82,-13.641 -9.515,-20.461 h -0.336 c 0.42,7.663 0.589,16.167 0.589,25.345 v 13.978 h -13.893 z"
id="path20"
style="fill:#f0efef" />
style="fill:#f0efef"
inkscape:connector-curvature="0" />
</g>
</g>
</svg>

Before: 4.1 KiB -> After: 4.2 KiB

Binary file not shown. (Before: 5.6 KiB -> After: 6.2 KiB)

Binary file not shown. (After: 834 B)


@@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
standaard.be
'''
@@ -9,15 +9,16 @@ from calibre.web.feeds.news import BasicNewsRecipe
class DeStandaard(BasicNewsRecipe):
title = u'De Standaard'
__author__ = u'Darko Miletic'
language = 'nl_BE'
language = 'nl_BE'
description = u'News from Belgium in Dutch'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
masthead_url = 'http://www.standaard.be/extra/css/images/masthead/logo_340x45.png'
publication_type = 'newspaper'
keep_only_tags = [dict(name='div' , attrs={'id':['intro','continued']})]
feeds = [(u'De Standaard Online', u'http://feeds.feedburner.com/dso-front')]
@@ -27,4 +28,4 @@ class DeStandaard(BasicNewsRecipe):
return article.get('guid', None)
def print_version(self, url):
return url.replace('/Detail.aspx?','/PrintArtikel.aspx?')
return url.replace('/artikel/detail.aspx?','/Artikel/PrintArtikel.aspx?')
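For illustration, the effect of the new print_version() rewrite on a made-up article URL (the article id below is hypothetical):

# illustration only; the article id is made up
url = 'http://www.standaard.be/artikel/detail.aspx?artikelid=G0L2QGAG'
print(url.replace('/artikel/detail.aspx?', '/Artikel/PrintArtikel.aspx?'))
# -> http://www.standaard.be/Artikel/PrintArtikel.aspx?artikelid=G0L2QGAG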


@@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
smh.com.au
'''
@@ -22,7 +22,11 @@ class Smh_au(BasicNewsRecipe):
remove_empty_feeds = True
masthead_url = 'http://images.smh.com.au/2010/02/02/1087188/smh-620.jpg'
publication_type = 'newspaper'
extra_css = ' h1{font-family: Georgia,"Times New Roman",Times,serif } body{font-family: Arial,Helvetica,sans-serif} .cT-imageLandscape{font-size: x-small} '
extra_css = """
h1{font-family: Georgia,"Times New Roman",Times,serif }
body{font-family: Arial,Helvetica,sans-serif}
.cT-imageLandscape,.cT-imagePortrait{font-size: x-small}
"""
conversion_options = {
'comment' : description
@@ -38,7 +42,11 @@ class Smh_au(BasicNewsRecipe):
]
remove_tags_after = [dict(name='div',attrs={'class':'articleBody'})]
keep_only_tags = [dict(name='div',attrs={'id':'content'})]
remove_attributes = ['width','height']
remove_tags = [
dict(attrs={'class':'hidden'}),
dict(name=['link','meta','base','embed','object','iframe'])
]
remove_attributes = ['width','height','lang']
def parse_index(self):
articles = []
@@ -66,3 +74,14 @@
,'description':description
})
return [(self.tag_to_string(soup.find('title')), articles)]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('body'):
item.name = 'div'
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
return soup


@@ -6,9 +6,10 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, cPickle
import os, cPickle, re, anydbm, shutil
from zlib import compress
from setup import Command, basenames
from setup import Command, basenames, __appname__
def get_opts_from_parser(parser):
def do_opt(opt):
@@ -26,6 +27,9 @@ class Resources(Command):
description = 'Compile various needed calibre resources'
KAKASI_PATH = os.path.join(Command.SRC, __appname__,
'ebooks', 'unihandecode', 'pykakasi')
def run(self, opts):
scripts = {}
for x in ('console', 'gui'):
@@ -101,11 +105,107 @@
import json
json.dump(function_dict, open(dest, 'wb'), indent=4)
self.run_kakasi(opts)
def run_kakasi(self, opts):
self.records = {}
src = self.j(self.KAKASI_PATH, 'kakasidict.utf8')
dest = self.j(self.RESOURCES, 'localization',
'pykakasi','kanwadict2.db')
base = os.path.dirname(dest)
if not os.path.exists(base):
os.makedirs(base)
if self.newer(dest, src):
self.info('\tGenerating Kanwadict')
for line in open(src, "r"):
self.parsekdict(line)
self.kanwaout(dest)
src = self.j(self.KAKASI_PATH, 'itaijidict.utf8')
dest = self.j(self.RESOURCES, 'localization',
'pykakasi','itaijidict2.pickle')
if self.newer(dest, src):
self.info('\tGenerating Itaijidict')
self.mkitaiji(src, dest)
src = self.j(self.KAKASI_PATH, 'kanadict.utf8')
dest = self.j(self.RESOURCES, 'localization',
'pykakasi','kanadict2.pickle')
if self.newer(dest, src):
self.info('\tGenerating kanadict')
self.mkkanadict(src, dest)
return
def mkitaiji(self, src, dst):
dic = {}
for line in open(src, "r"):
line = line.decode("utf-8").strip()
if line.startswith(';;'): # skip comment
continue
if re.match(r"^$",line):
continue
pair = re.sub(r'\\u([0-9a-fA-F]{4})', lambda x:unichr(int(x.group(1),16)), line)
dic[pair[0]] = pair[1]
cPickle.dump(dic, open(dst, 'w'), protocol=-1) #pickle
def mkkanadict(self, src, dst):
dic = {}
for line in open(src, "r"):
line = line.decode("utf-8").strip()
if line.startswith(';;'): # skip comment
continue
if re.match(r"^$",line):
continue
(alpha, kana) = line.split(' ')
dic[kana] = alpha
cPickle.dump(dic, open(dst, 'w'), protocol=-1) #pickle
def parsekdict(self, line):
line = line.decode("utf-8").strip()
if line.startswith(';;'): # skip comment
return
(yomi, kanji) = line.split(' ')
if ord(yomi[-1:]) <= ord('z'):
tail = yomi[-1:]
yomi = yomi[:-1]
else:
tail = ''
self.updaterec(kanji, yomi, tail)
def updaterec(self, kanji, yomi, tail):
key = "%04x"%ord(kanji[0])
if key in self.records:
if kanji in self.records[key]:
rec = self.records[key][kanji]
rec.append((yomi,tail))
self.records[key].update( {kanji: rec} )
else:
self.records[key][kanji]=[(yomi, tail)]
else:
self.records[key] = {}
self.records[key][kanji]=[(yomi, tail)]
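# Illustration only (not part of the original file): one made-up kakasidict
# line, u"\u3084\u3059i \u5b89" (yomi "yasu" + okurigana tail 'i' + kanji),
# flows through parsekdict/updaterec above as:
#   yomi = u"\u3084\u3059", tail = 'i', kanji = u"\u5b89"
#   self.records['5b89'] == {u"\u5b89": [(u"\u3084\u3059", 'i')]}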
def kanwaout(self, out):
dic = anydbm.open(out, 'c')
for (k, v) in self.records.iteritems():
dic[k] = compress(cPickle.dumps(v, -1))
dic.close()
def clean(self):
for x in ('scripts', 'recipes', 'ebook-convert-complete'):
x = self.j(self.RESOURCES, x+'.pickle')
if os.path.exists(x):
os.remove(x)
kakasi = self.j(self.RESOURCES, 'localization', 'pykakasi')
if os.path.exists(kakasi):
shutil.rmtree(kakasi)
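A hedged sketch of reading one record back from the database generated above (key and value layout as produced by updaterec and kanwaout; the path assumes the default resources tree):

import anydbm, cPickle
from zlib import decompress
db = anydbm.open('resources/localization/pykakasi/kanwadict2.db', 'r')
key = '%04x' % ord(u'\u6f22')                 # group key: codepoint of the first kanji (here 漢)
entries = cPickle.loads(decompress(db[key]))  # {kanji: [(yomi, tail), ...]}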


@@ -152,8 +152,17 @@ def check_ebook_format(stream, current_guess):
stream.seek(0)
return ans
def normalize(x):
if isinstance(x, unicode):
import unicodedata
x = unicodedata.normalize('NFKC', x)
return x
def calibre_cover(title, author_string, series_string=None,
output_format='jpg', title_size=46, author_size=36):
title = normalize(title)
author_string = normalize(author_string)
series_string = normalize(series_string)
from calibre.utils.magick.draw import create_cover_page, TextLine
lines = [TextLine(title, title_size), TextLine(author_string, author_size)]
if series_string:
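A standalone example of what the normalize() helper added in this hunk does: NFKC folds compatibility characters into their plain equivalents before the cover text is drawn.

import unicodedata
print(unicodedata.normalize('NFKC', u'\ufb01re \u2166'))  # fi-ligature and Roman numeral sign expand: -> u'fire VII'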


@@ -402,8 +402,8 @@ OptionRecommendation(name='asciiize',
'with "Mikhail Gorbachiov". Also, note that in '
'cases where there are multiple representations of a character '
'(characters shared by Chinese and Japanese for instance) the '
'representation used by the largest number of people will be '
'used (Chinese in the previous example).')%\
'representation based on the current calibre interface language will be '
'used.')%\
u'\u041c\u0438\u0445\u0430\u0438\u043b '
u'\u0413\u043e\u0440\u0431\u0430\u0447\u0451\u0432'
)
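A hedged sketch of the behavior this help text describes (exact romanizations depend on the bundled dictionaries):

from calibre.ebooks.unihandecode import Unihandecoder
han = u'\u6f22\u5b57'                        # the same two characters, read differently per language
print(Unihandecoder(lang='ja').decode(han))  # Japanese reading via kakasi
print(Unihandecoder(lang='zh').decode(han))  # default Chinese-derived reading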


@@ -543,9 +543,9 @@ class HTMLPreProcessor(object):
html = XMLDECL_RE.sub('', html)
if getattr(self.extra_opts, 'asciiize', False):
from calibre.ebooks.unidecode.unidecoder import Unidecoder
unidecoder = Unidecoder()
html = unidecoder.decode(html)
from calibre.utils.localization import get_udc
unihandecoder = get_udc()
html = unihandecoder.decode(html)
if getattr(self.extra_opts, 'enable_heuristics', False):
from calibre.ebooks.conversion.utils import HeuristicProcessor
@@ -557,10 +557,10 @@ class HTMLPreProcessor(object):
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
if unsupported_unicode_chars:
from calibre.ebooks.unidecode.unidecoder import Unidecoder
unidecoder = Unidecoder()
from calibre.utils.localization import get_udc
unihandecoder = get_udc()
for char in unsupported_unicode_chars:
asciichar = unidecoder.decode(char)
asciichar = unihandecoder.decode(char)
html = html.replace(char, asciichar)
return html


@@ -12,6 +12,7 @@ __docformat__ = 'restructuredtext en'
from struct import pack, unpack
from cStringIO import StringIO
from calibre.ebooks import normalize
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.writer import rescale_image, MAX_THUMB_DIMEN
from calibre.ebooks.mobi.langcodes import iana2mobi
@@ -311,6 +312,7 @@ class MetadataUpdater(object):
return StreamSlicer(self.stream, start, stop)
def update(self, mi):
mi.title = normalize(mi.title)
def update_exth_record(rec):
recs.append(rec)
if rec[0] in self.original_exth_records:
@@ -331,12 +333,12 @@
kindle_pdoc = None
if mi.author_sort and pas:
authors = mi.author_sort
update_exth_record((100, authors.encode(self.codec, 'replace')))
update_exth_record((100, normalize(authors).encode(self.codec, 'replace')))
elif mi.authors:
authors = ';'.join(mi.authors)
update_exth_record((100, authors.encode(self.codec, 'replace')))
update_exth_record((100, normalize(authors).encode(self.codec, 'replace')))
if mi.publisher:
update_exth_record((101, mi.publisher.encode(self.codec, 'replace')))
update_exth_record((101, normalize(mi.publisher).encode(self.codec, 'replace')))
if mi.comments:
# Strip user annotations
a_offset = mi.comments.find('<div class="user_annotations">')
@@ -345,12 +347,12 @@
mi.comments = mi.comments[:a_offset]
if ad_offset >= 0:
mi.comments = mi.comments[:ad_offset]
update_exth_record((103, mi.comments.encode(self.codec, 'replace')))
update_exth_record((103, normalize(mi.comments).encode(self.codec, 'replace')))
if mi.isbn:
update_exth_record((104, mi.isbn.encode(self.codec, 'replace')))
if mi.tags:
subjects = '; '.join(mi.tags)
update_exth_record((105, subjects.encode(self.codec, 'replace')))
update_exth_record((105, normalize(subjects).encode(self.codec, 'replace')))
if kindle_pdoc and kindle_pdoc in mi.tags:
update_exth_record((501, str('PDOC')))


@@ -14,8 +14,9 @@ import re
from struct import pack
import time
from urlparse import urldefrag
from cStringIO import StringIO
from calibre.ebooks import normalize
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.ebooks.mobi.mobiml import MBP_NS
from calibre.ebooks.oeb.base import OEB_DOCS
@@ -1365,7 +1366,7 @@ class MobiWriter(object):
self._text_length,
self._text_nrecords-1, RECORD_SIZE, 0, 0)) # 0 - 15 (0x0 - 0xf)
uid = random.randint(0, 0xffffffff)
title = unicode(metadata.title[0]).encode('utf-8')
title = normalize(unicode(metadata.title[0])).encode('utf-8')
# The MOBI Header
# 0x0 - 0x3
@@ -1523,12 +1524,12 @@
items = oeb.metadata[term]
if term == 'creator':
if self._prefer_author_sort:
creators = [unicode(c.file_as or c) for c in items]
creators = [normalize(unicode(c.file_as or c)) for c in items]
else:
creators = [unicode(c) for c in items]
creators = [normalize(unicode(c)) for c in items]
items = ['; '.join(creators)]
for item in items:
data = self.COLLAPSE_RE.sub(' ', unicode(item))
data = self.COLLAPSE_RE.sub(' ', normalize(unicode(item)))
if term == 'identifier':
if data.lower().startswith('urn:isbn:'):
data = data[9:]
@@ -1542,7 +1543,7 @@
nrecs += 1
if term == 'rights' :
try:
rights = unicode(oeb.metadata.rights[0]).encode('utf-8')
rights = normalize(unicode(oeb.metadata.rights[0])).encode('utf-8')
except:
rights = 'Unknown'
exth.write(pack('>II', EXTH_CODES['rights'], len(rights) + 8))

File diff suppressed because it is too large.


@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
__all__ = ["Unihandecoder"]
'''
Decode unicode text to an ASCII representation of the text.
Translate unicode characters to ASCII.
Inspired by John Schember's unidecode library, which was created as part
of calibre.
Copyright(c) 2009, John Schember
Transliterate the string from unicode characters to ASCII for Chinese and other languages.
'''
import unicodedata
class Unihandecoder(object):
preferred_encoding = None
decoder = None
def __init__(self, lang="zh", encoding='utf-8'):
self.preferred_encoding = encoding
lang = lang.lower()
if lang[:2] == u'ja':
from calibre.ebooks.unihandecode.jadecoder import Jadecoder
self.decoder = Jadecoder()
elif lang[:2] == u'kr' or lang == u'korean':
from calibre.ebooks.unihandecode.krdecoder import Krdecoder
self.decoder = Krdecoder()
elif lang[:2] == u'vn' or lang == u'vietnum':
from calibre.ebooks.unihandecode.vndecoder import Vndecoder
self.decoder = Vndecoder()
else: #zh and others
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
self.decoder = Unidecoder()
def decode(self, text):
try:
unicode # python2
if not isinstance(text, unicode):
try:
text = unicode(text)
except:
try:
text = text.decode(self.preferred_encoding)
except:
text = text.decode('utf-8', 'replace')
except: # python3, str is unicode
pass
# first, Unicode-normalize the text to NFKC form (see the Unicode standard)
ntext = unicodedata.normalize('NFKC', text)
return self.decoder.decode(ntext)
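A minimal usage sketch, assuming the pykakasi dictionaries have been generated by setup/resources.py:

from calibre.ebooks.unihandecode import Unihandecoder
ud = Unihandecoder(lang='ja')
print(ud.decode(u'\u65e5\u672c\u8a9e'))  # Japanese text romanized via kakasi, e.g. 'Nihongo'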

File diff suppressed because it is too large.


@@ -0,0 +1,41 @@
# coding:utf8
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
'''
Decode unicode text to an ASCII representation of the text for Japanese.
Translates a unicode string to an ASCII roman string.
The API is based on the python unidecode module,
which is based on the Ruby gem (http://rubyforge.org/projects/unidecode/)
and the perl module Text::Unidecode
(http://search.cpan.org/~sburke/Text-Unidecode-0.04/).
The transliteration itself is provided by the KAKASI Japanese processing engine.
Copyright (c) 2010 Hiroshi Miura
'''
import re
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
from calibre.ebooks.unihandecode.jacodepoints import CODEPOINTS as JACODES
from calibre.ebooks.unihandecode.pykakasi.kakasi import kakasi
class Jadecoder(Unidecoder):
kakasi = None
codepoints = {}
def __init__(self):
self.codepoints = CODEPOINTS
self.codepoints.update(JACODES)
self.kakasi = kakasi()
def decode(self, text):
try:
result=self.kakasi.do(text)
return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),result)
except:
return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),text)

File diff suppressed because it is too large.


@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
'''
Decode unicode text to an ASCII representation of the text in Korean.
Based on unidecoder.
'''
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
from calibre.ebooks.unihandecode.krcodepoints import CODEPOINTS as HANCODES
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
class Krdecoder(Unidecoder):
codepoints = {}
def __init__(self):
self.codepoints = CODEPOINTS
self.codepoints.update(HANCODES)


@@ -0,0 +1,5 @@
from calibre.ebooks.unihandecode.pykakasi.kakasi import kakasi
kakasi # referenced so the import is not flagged as unused
__all__ = ["pykakasi"]


@@ -0,0 +1,185 @@
# -*- coding: utf-8 -*-
# h2a.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
class H2a (object):
H2a_table = {
u"\u3041":"a", u"\u3042":"a",
u"\u3043":"i", u"\u3044":"i",
u"\u3045":"u", u"\u3046":"u",
u"\u3046\u309b":"vu", u"\u3046\u309b\u3041":"va",
u"\u3046\u309b\u3043":"vi", u"\u3046\u309b\u3047":"ve",
u"\u3046\u309b\u3049":"vo",
u"\u3047":"e", u"\u3048":"e",
u"\u3049":"o", u"\u304a":"o",
u"\u304b":"ka", u"\u304c":"ga",
u"\u304d":"ki", u"\u304d\u3041":"kya",
u"\u304d\u3045":"kyu", u"\u304d\u3049":"kyo",
u"\u304e":"gi", u"\u3050\u3083":"gya",
u"\u304e\u3045":"gyu", u"\u304e\u3087":"gyo",
u"\u304f":"ku", u"\u3050":"gu",
u"\u3051":"ke", u"\u3052":"ge",
u"\u3053":"ko", u"\u3054":"go",
u"\u3055":"sa", u"\u3056":"za",
u"\u3057":"shi", u"\u3057\u3083":"sha",
u"\u3057\u3085":"shu", u"\u3057\u3087":"sho",
u"\u3058":"ji", u"\u3058\u3083":"ja",
u"\u3058\u3085":"ju", u"\u3058\u3087":"jo",
u"\u3059":"su", u"\u305a":"zu",
u"\u305b":"se", u"\u305c":"ze",
u"\u305d":"so", u"\u305e":"zo",
u"\u305f":"ta", u"\u3060":"da",
u"\u3061":"chi", u"\u3061\u3047":"che", u"\u3061\u3083":"cha",
u"\u3061\u3085":"chu", u"\u3061\u3087":"cho",
u"\u3062":"ji", u"\u3062\u3083":"ja",
u"\u3062\u3085":"ju", u"\u3062\u3087":"jo",
u"\u3063":"tsu",
u"\u3063\u3046\u309b":"vvu",
u"\u3063\u3046\u309b\u3041":"vva",
u"\u3063\u3046\u309b\u3043":"vvi",
u"\u3063\u3046\u309b\u3047":"vve",
u"\u3063\u3046\u309b\u3049":"vvo",
u"\u3063\u304b":"kka", u"\u3063\u304c":"gga",
u"\u3063\u304d":"kki", u"\u3063\u304d\u3083":"kkya",
u"\u3063\u304d\u3085":"kkyu", u"\u3063\u304d\u3087":"kkyo",
u"\u3063\u304e":"ggi", u"\u3063\u304e\u3083":"ggya",
u"\u3063\u304e\u3085":"ggyu", u"\u3063\u304e\u3087":"ggyo",
u"\u3063\u304f":"kku", u"\u3063\u3050":"ggu",
u"\u3063\u3051":"kke", u"\u3063\u3052":"gge",
u"\u3063\u3053":"kko", u"\u3063\u3054":"ggo",
u"\u3063\u3055":"ssa", u"\u3063\u3056":"zza",
u"\u3063\u3057":"sshi", u"\u3063\u3057\u3083":"ssha",
u"\u3063\u3057\u3085":"sshu", u"\u3063\u3057\u3087":"ssho",
u"\u3063\u3058":"jji", u"\u3063\u3058\u3083":"jja",
u"\u3063\u3058\u3085":"jju", u"\u3063\u3058\u3087":"jjo",
u"\u3063\u3059":"ssu", u"\u3063\u305a":"zzu",
u"\u3063\u305b":"sse", u"\u3063\u305e":"zze",
u"\u3063\u305d":"sso", u"\u3063\u305e":"zzo",
u"\u3063\u305f":"tta", u"\u3063\u3060":"dda",
u"\u3063\u3061":"tchi", u"\u3063\u3061\u3083":"tcha",
u"\u3063\u3061\u3085":"tchu", u"\u3063\u3061\u3087":"tcho",
u"\u3063\u3062":"jji", u"\u3063\u3062\u3083":"jjya",
u"\u3063\u3062\u3085":"jjyu", u"\u3063\u3062\u3087":"jjyo",
u"\u3063\u3064":"ttsu", u"\u3063\u3065":"zzu",
u"\u3063\u3066":"tte", u"\u3063\u3067":"dde",
u"\u3063\u3068":"tto", u"\u3063\u3069":"ddo",
u"\u3063\u306f":"hha", u"\u3063\u3070":"bba",
u"\u3063\u3071":"ppa",
u"\u3063\u3072":"hhi", u"\u3063\u3072\u3083":"hhya",
u"\u3063\u3072\u3085":"hhyu", u"\u3063\u3072\u3087":"hhyo",
u"\u3063\u3073":"bbi", u"\u3063\u3073\u3083":"bbya",
u"\u3063\u3073\u3085":"bbyu", u"\u3063\u3073\u3087":"bbyo",
u"\u3063\u3074":"ppi", u"\u3063\u3074\u3083":"ppya",
u"\u3063\u3074\u3085":"ppyu", u"\u3063\u3074\u3087":"ppyo",
u"\u3063\u3075":"ffu", u"\u3063\u3075\u3041":"ffa",
u"\u3063\u3075\u3043":"ffi", u"\u3063\u3075\u3047":"ffe",
u"\u3063\u3075\u3049":"ffo",
u"\u3063\u3076":"bbu", u"\u3063\u3077":"ppu",
u"\u3063\u3078":"hhe", u"\u3063\u3079":"bbe",
u"\u3063\u307a":"ppe",
u"\u3063\u307b":"hho", u"\u3063\u307c":"bbo",
u"\u3063\u307d":"ppo",
u"\u3063\u3084":"yya", u"\u3063\u3086":"yyu",
u"\u3063\u3088":"yyo",
u"\u3063\u3089":"rra", u"\u3063\u308a":"rri",
u"\u3063\u308a\u3083":"rrya", u"\u3063\u308a\u3085":"rryu",
u"\u3063\u308a\u3087":"rryo",
u"\u3063\u308b":"rru", u"\u3063\u308c":"rre",
u"\u3063\u308d":"rro",
u"\u3064":"tsu", u"\u3065":"zu",
u"\u3066":"te", u"\u3067":"de", u"\u3067\u3043":"di",
u"\u3068":"to", u"\u3069":"do",
u"\u306a":"na",
u"\u306b":"ni", u"\u306b\u3083":"nya",
u"\u306b\u3085":"nyu", u"\u306b\u3087":"nyo",
u"\u306c":"nu", u"\u306d":"ne", u"\u306e":"no",
u"\u306f":"ha", u"\u3070":"ba", u"\u3071":"pa",
u"\u3072":"hi", u"\u3072\u3083":"hya",
u"\u3072\u3085":"hyu", u"\u3072\u3087":"hyo",
u"\u3073":"bi", u"\u3073\u3083":"bya",
u"\u3073\u3085":"byu", u"\u3073\u3087":"byo",
u"\u3074":"pi", u"\u3074\u3083":"pya",
u"\u3074\u3085":"pyu", u"\u3074\u3087":"pyo",
u"\u3075":"fu", u"\u3075\u3041":"fa",
u"\u3075\u3043":"fi", u"\u3075\u3047":"fe",
u"\u3075\u3049":"fo",
u"\u3076":"bu", u"\u3077":"pu",
u"\u3078":"he", u"\u3079":"be", u"\u307a":"pe",
u"\u307b":"ho", u"\u307c":"bo", u"\u307d":"po",
u"\u307e":"ma",
u"\u307f":"mi", u"\u307f\u3083":"mya",
u"\u307f\u3085":"myu", u"\u307f\u3087":"myo",
u"\u3080":"mu", u"\u3081":"me", u"\u3082":"mo",
u"\u3083":"ya", u"\u3084":"ya",
u"\u3085":"yu", u"\u3086":"yu",
u"\u3087":"yo", u"\u3088":"yo",
u"\u3089":"ra",
u"\u308a":"ri", u"\u308a\u3083":"rya",
u"\u308a\u3085":"ryu", u"\u308a\u3087":"ryo",
u"\u308b":"ru", u"\u308c":"re", u"\u308d":"ro",
u"\u308e":"wa", u"\u308f":"wa",
u"\u3090":"i", u"\u3091":"e",
u"\u3092":"wo", u"\u3093":"n",
u"\u3093\u3042":"n'a", u"\u3093\u3044":"n'i",
u"\u3093\u3046":"n'u", u"\u3093\u3048":"n'e",
u"\u3093\u304a":"n'o",
}
# this class is Borg
_shared_state = {}
def __new__(cls, *p, **k):
self = object.__new__(cls, *p, **k)
self.__dict__ = cls._shared_state
return self
def isHiragana(self, char):
return ( 0x3040 < ord(char) and ord(char) < 0x3094)
def convert(self, text):
Hstr = ""
max_len = -1
r = min(4, len(text)+1)
for x in xrange(r):
if text[:x] in self.H2a_table:
if max_len < x:
max_len = x
Hstr = self.H2a_table[text[:x]]
return (Hstr, max_len)
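A worked example of the longest-match lookup in H2a.convert() above (standalone sketch using the class as defined):

h2a = H2a()
print(h2a.convert(u'\u3057\u3083\u304b'))
# -> ('sha', 2): u'\u3057\u3083' matches as one digraph, so the caller advances two characters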


@@ -0,0 +1,564 @@
芦蘆
壱一
苅刈
舘館
曽曾
菟兎
島嶋
盃杯
冨富
峯峰
亘亙
弌一
乘乗
亂乱
豫予
亊事
弍二
亞亜
亰京
从従
仭仞
佛仏
來来
儘侭
伜倅
假仮
會会
做作
傳伝
僞偽
價価
儉倹
兒児
兔兎
竸競
兩両
囘回
册冊
冢塚
冩写
决決
冱冴
冰氷
况況
凉涼
處処
凾函
刄刃
刔抉
刧劫
剩剰
劍剣
劔剣
劒剣
剱剣
劑剤
辨弁
勞労
勳勲
勵励
勸勧
區区
卆卒
丗世
凖準
夘卯
卻却
卷巻
厠廁
厦廈
厮廝
厰廠
參参
雙双
咒呪
單単
噐器
營営
嚏嚔
嚴厳
囑嘱
囓齧
圀国
圈圏
國国
圍囲
圓円
團団
圖図
埀垂
埓埒
塲場
壞壊
墮堕
壓圧
壘塁
壥廛
壤壌
壯壮
壺壷
壹一
壻婿
壽寿
夂夊
夛多
梦夢
竒奇
奧奥
奬奨
侫佞
姙妊
嫻嫺
孃嬢
學学
斈学
寃冤
寇冦
寢寝
寫写
寶宝
寳宝
尅剋
將将
專専
對対
尓爾
尢尤
屆届
屬属
峽峡
嶌嶋
嵜崎
崙崘
嵳嵯
嶽岳
巛川
巵卮
帋紙
帶帯
幤幣
廐厩
廏厩
廣広
廚厨
廢廃
廳庁
廰庁
廸迪
弃棄
弉奘
彜彝
彈弾
彌弥
弯彎
徃往
徑径
從従
徠来
悳徳
恠怪
恆恒
悧俐
惡悪
惠恵
忰悴
惱悩
愼慎
愽博
慘惨
慚慙
憇憩
應応
懷懐
懴懺
戀恋
戞戛
戰戦
戲戯
拔抜
拏拿
擔担
拜拝
拂払
挾挟
搜捜
插挿
搖揺
攝摂
攪撹
據拠
擇択
擧拳
舉拳
抬擡
擴拡
攜携
攵攴
攷考
收収
效効
敕勅
敍叙
敘叙
數数
變変
斷断
旙旛
昜陽
晄晃
晉晋
晝昼
晰晢
暎映
曉暁
暸瞭
昿曠
曵曳
朖朗
朞期
霸覇
杤栃
杰傑
枩松
檜桧
條条
檮梼
梹檳
棊棋
棧桟
棕椶
楙茂
榮栄
槨椁
樂楽
權権
樞枢
樣様
樓楼
橢楕
檢検
櫻桜
鬱欝
盜盗
飮飲
歐嘔
歡歓
歸帰
殘残
殱殲
殼殻
毆殴
毓育
氣気
沒没
泪涙
濤涛
渕淵
渊淵
淨浄
淺浅
滿満
溂剌
溪渓
灌潅
滯滞
澁渋
澀渋
潛潜
濳潜
澂澄
澑溜
澤沢
濟済
濕湿
濱浜
濾滬
灣湾
烱炯
烟煙
熈煕
熏燻
燒焼
爐炉
爭争
爲為
爼俎
犁犂
犹猶
犲豺
狹狭
獎奨
默黙
獨独
獸獣
獵猟
獻献
珎珍
璢瑠
瑯琅
珱瓔
瓣弁
甞嘗
甼町
畄留
畍界
畊耕
畆畝
畧略
畫画
當当
畴疇
疊畳
疉畳
疂畳
癡痴
發発
皃猊
皈帰
皹皸
盖蓋
盡尽
蘯盪
眞真
眦眥
礦鉱
礪砺
碎砕
碯瑙
祕秘
祿禄
齋斎
禪禅
禮礼
禀稟
稱称
稻稲
稾稿
穗穂
穩穏
龝穐
穰穣
窗窓
竈竃
窰窯
竊窃
竝並
筺筐
笋筍
箟箘
筝箏
簔蓑
籠篭
籘籐
籖籤
粹粋
糺糾
絲糸
經経
總総
緜綿
縣県
縱縦
繪絵
繩縄
繼継
緕纃
續続
纖繊
纎繊
纜繿
缺欠
罐缶
罸罰
羃冪
羣群
羮羹
譱善
翆翠
翦剪
耻恥
聟婿
聨聯
聲声
聰聡
聽聴
肅粛
冐冒
脉脈
腦脳
腟膣
膓腸
膸髄
膽胆
臈臘
臟臓
臺台
與与
舊旧
舍舎
舖舗
舩船
艢檣
舮艫
艷艶
莖茎
莊荘
莵兎
菷帚
萠萌
蕚萼
蒂蔕
萬万
葢蓋
蘂蕊
蕋蕊
藪薮
藏蔵
藝芸
藥薬
蘓蘇
乕虎
號号
蠣蛎
蝨虱
蠅蝿
螢蛍
蟆蟇
蟲虫
蠏蟹
蟷螳
蟒蠎
蠶蚕
蠧蠹
蠻蛮
衂衄
衞衛
袵衽
裝装
襃褒
褝襌
覩睹
覺覚
覽覧
觀観
觧解
觸触
誡戒
謌歌
諡謚
謠謡
證証
譛譖
譯訳
譽誉
讀読
讓譲
讚賛
豐豊
貉狢
貍狸
貎猊
豼貔
貘獏
戝財
貭質
貳弐
貮弐
賤賎
賣売
贊賛
賍贓
赱走
踈疎
踴踊
躰体
軆体
軈軅
軣轟
輕軽
輙輒
輌輛
轉転
辭辞
辯弁
迯逃
逹達
逎遒
遞逓
遲遅
邊辺
邉辺
邨村
鄰隣
醉酔
醫医
釀醸
釋釈
釡釜
釼剣
銕鉄
錢銭
鎭鎮
鐵鉄
鐡鉄
鑒鑑
鑄鋳
鑛鉱
鈩鑪
鑚鑽
閇閉
濶闊
關関
阯址
陷陥
險険
隱隠
隸隷
襍雑
雜雑
靈霊
靜静
靱靭
韭韮
韲齏
韵韻
顏顔
顯顕
飃飄
餘余
餝飾
餠餅
騷騒
驅駆
驛駅
驗験
髓髄
體体
髮髪
鬪闘
鰺鯵
鰛鰮
鳬鳧
鳫鴈
鵄鴟
鵞鵝
鷄鶏
鷏鷆
鹽塩
麥麦
麸麩
麪麺
點点
黨党
皷鼓
鼡鼠
齊斉
齒歯
齡齢
龜亀
槇槙
遙遥
瑤瑶
凜凛
熙煕
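Each line of this table pairs a variant (itaiji) glyph with its standard form; mkitaiji in setup/resources.py pickles it into itaijidict2.pickle. A hedged sketch of how one pair from the table (the 國国 line) is applied:

itaiji = {u'\u570b': u'\u56fd'}  # one entry: variant 國 -> standard 国
print(u'\u4e2d\u570b'.replace(u'\u570b', itaiji[u'\u570b']))  # 中國 -> 中国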


@@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
# j2h.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original Copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
from calibre.ebooks.unihandecode.pykakasi.jisyo import jisyo
import re
class J2H (object):
kanwa = None
cl_table = [
"","aiueow", "aiueow", "aiueow", "aiueow", "aiueow", "aiueow", "aiueow",
"aiueow", "aiueow", "aiueow", "k", "g", "k", "g", "k", "g", "k", "g", "k",
"g", "s", "zj", "s", "zj", "s", "zj", "s", "zj", "s", "zj", "t", "d", "tc",
"d", "aiueokstchgzjfdbpw", "t", "d", "t", "d", "t", "d", "n", "n", "n", "n",
"n", "h", "b", "p", "h", "b", "p", "hf", "b", "p", "h", "b", "p", "h", "b",
"p", "m", "m", "m", "m", "m", "y", "y", "y", "y", "y", "y", "rl", "rl",
"rl", "rl", "rl", "wiueo", "wiueo", "wiueo", "wiueo", "w", "n", "v", "k",
"k", "", "", "", "", "", "", "", "", ""]
def __init__(self):
self.kanwa = jisyo()
def isKanji(self, c):
return ( 0x3400 <= ord(c) and ord(c) < 0xfa2e)
def isCletter(self, l, c):
if (ord(u"") <= ord(c) and ord(c) <= 0x309f) and ( l in self.cl_table[ord(c) - ord(u"")-1]):
return True
return False
def itaiji_conv(self, text):
r = []
for c in text:
if c in self.kanwa.itaijidict:
r.append(c)
for c in r:
text = re.sub(c, self.kanwa.itaijidict[c], text)
return text
def convert(self, text):
max_len = 0
Hstr = ""
table = self.kanwa.load_jisyo(text[0])
if table is None:
return ("", 0)
for (k,v) in table.iteritems():
length = len(k)
if len(text) >= length:
if text.startswith(k):
for (yomi, tail) in v:
if tail == '':
if max_len < length:
Hstr = yomi
max_len = length
elif max_len < length+1 and len(text) > length and self.isCletter(tail, text[length]):
Hstr=''.join([yomi,text[length]])
max_len = length+1
return (Hstr, max_len)


@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
# jisyo.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
from cPickle import load, loads
import anydbm
from zlib import decompress
import os
import calibre.utils.resources as resources
class jisyo (object):
kanwadict = None
itaijidict = None
kanadict = None
jisyo_table = {}
# this class is Borg
_shared_state = {}
def __new__(cls, *p, **k):
self = object.__new__(cls, *p, **k)
self.__dict__ = cls._shared_state
return self
def __init__(self):
if self.kanwadict is None:
dictpath = resources.get_path(os.path.join('localization','pykakasi','kanwadict2.db'))
self.kanwadict = anydbm.open(dictpath,'r')
if self.itaijidict is None:
itaijipath = resources.get_path(os.path.join('localization','pykakasi','itaijidict2.pickle'))
itaiji_pkl = open(itaijipath, 'rb')
self.itaijidict = load(itaiji_pkl)
if self.kanadict is None:
kanadictpath = resources.get_path(os.path.join('localization','pykakasi','kanadict2.pickle'))
kanadict_pkl = open(kanadictpath, 'rb')
self.kanadict = load(kanadict_pkl)
def load_jisyo(self, char):
try:#python2
key = "%04x"%ord(unicode(char))
except:#python3
key = "%04x"%ord(char)
try: #already exist?
table = self.jisyo_table[key]
except:
try:
table = self.jisyo_table[key] = loads(decompress(self.kanwadict[key])) # records are cPickle-dumped by kanwaout in setup/resources.py
except:
return None
return table


@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
# k2a.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
from calibre.ebooks.unihandecode.pykakasi.jisyo import jisyo
class K2a (object):
kanwa = None
def __init__(self):
self.kanwa = jisyo()
def isKatakana(self, char):
return ( 0x30a0 < ord(char) and ord(char) < 0x30f7)
def convert(self, text):
Hstr = ""
max_len = -1
r = min(10, len(text)+1)
for x in xrange(r):
if text[:x] in self.kanwa.kanadict:
if max_len < x:
max_len = x
Hstr = self.kanwa.kanadict[text[:x]]
return (Hstr, max_len)


@@ -0,0 +1,101 @@
# -*- coding: utf-8 -*-
# kakasi.py
#
# Copyright 2011 Hiroshi Miura <miurahr@linux.com>
#
# Original Copyright:
# * KAKASI (Kanji Kana Simple inversion program)
# * $Id: jj2.c,v 1.7 2001-04-12 05:57:34 rug Exp $
# * Copyright (C) 1992
# * Hironobu Takahashi (takahasi@tiny.or.jp)
# *
# * This program is free software; you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation; either versions 2, or (at your option)
# * any later version.
# *
# * This program is distributed in the hope that it will be useful
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with KAKASI, see the file COPYING. If not, write to the Free
# * Software Foundation Inc., 59 Temple Place - Suite 330, Boston, MA
# * 02111-1307, USA.
# */
from calibre.ebooks.unihandecode.pykakasi.j2h import J2H
from calibre.ebooks.unihandecode.pykakasi.h2a import H2a
from calibre.ebooks.unihandecode.pykakasi.k2a import K2a
class kakasi(object):
j2h = None
h2a = None
k2a = None
def __init__(self):
self.j2h = J2H()
self.h2a = H2a()
self.k2a = K2a()
def do(self, text):
otext = ''
i = 0
while True:
if i >= len(text):
break
if self.j2h.isKanji(text[i]):
(t, l) = self.j2h.convert(text[i:])
if l <= 0:
otext = otext + text[i]
i = i + 1
continue
i = i + l
m = 0
tmptext = ""
while True:
if m >= len(t):
break
(s, n) = self.h2a.convert(t[m:])
if n <= 0:
break
m = m + n
tmptext = tmptext+s
if i >= len(text):
otext = otext + tmptext.capitalize()
else:
otext = otext + tmptext.capitalize() +' '
elif self.h2a.isHiragana(text[i]):
tmptext = ''
while True:
(t, l) = self.h2a.convert(text[i:])
tmptext = tmptext+t
i = i + l
if i >= len(text):
otext = otext + tmptext
break
elif not self.h2a.isHiragana(text[i]):
otext = otext + tmptext + ' '
break
elif self.k2a.isKatakana(text[i]):
tmptext = ''
while True:
(t, l) = self.k2a.convert(text[i:])
tmptext = tmptext+t
i = i + l
if i >= len(text):
otext = otext + tmptext
break
elif not self.k2a.isKatakana(text[i]):
otext = otext + tmptext + ' '
break
else:
otext = otext + text[i]
i += 1
return otext

File diff suppressed because it is too large.


@@ -0,0 +1,317 @@
;; Kana-Alphabet mapping dictionary
;;
;; To use this mapping table,
;; you should first Unicode-normalize input to NFKC form.
;;
;; basic mapping
;;
a ァ
a ア
ba バ
bba ッバ
bbe ッベ
bbi ッビ
bbo ッボ
bbu ッブ
bbya ッビャ
bbyo ッビョ
bbyu ッビュ
be ベ
bi ビ
bo ボ
bu ブ
bya ビャ
byo ビョ
byu ビュ
cha チャ
che チェ
chi チ
cho チョ
chu チュ
da ダ
dda ッダ
dde ッデ
ddo ッド
de デ
di ディ
do ド
e ェ
e エ
e ヱ
fa ファ
fe フェ
ffa ッファ
ffe ッフェ
ffi ッフィ
ffo ッフォ
ffu ッフ
fi フィ
fo フォ
fu フ
ga ガ
ge ゲ
gga ッガ
gge ッゲ
ggi ッギ
ggo ッゴ
ggu ッグ
ggya ッギャ
ggyo ッギョ
ggyu ッギュ
gi ギ
go ゴ
gu グ
gya グャ
gyo ギョ
gyu ギゥ
ha ハ
he ヘ
hha ッハ
hhe ッヘ
hhi ッヒ
hho ッホ
hhya ッヒャ
hhyo ッヒョ
hhyu ッヒュ
hi ヒ
ho ホ
hya ヒャ
hyo ヒョ
hyu ヒュ
i ィ
i イ
i ヰ
ja ジャ
ja ヂャ
ji ジ
ji ヂ
jja ッジャ
jji ッジ
jji ッヂ
jjo ッジョ
jju ッジュ
jjya ッヂャ
jjyo ッヂョ
jjyu ッヂュ
jo ジョ
jo ヂョ
ju ジュ
ju ヂュ
ka カ
ka ヵ
ke ケ
ke ヶ
ki キ
kka ッカ
kke ッケ
kki ッキ
kko ッコ
kku ック
kkya ッキャ
kkyo ッキョ
kkyu ッキュ
ko コ
ku ク
kya キァ
kyo キォ
kyu キゥ
ma マ
me メ
mi ミ
mo モ
mu ム
mya ミャ
myo ミョ
myu ミュ
n ン
n'a ンア
n'e ンエ
n'i ンイ
n'o ンオ
n'u ンウ
na ナ
ne ネ
ni ニ
no ノ
nu ヌ
nya ニャ
nyo ニョ
nyu ニュ
o ォ
o オ
pa パ
pe ペ
pi ピ
po ポ
ppa ッパ
ppe ッペ
ppi ッピ
ppo ッポ
ppu ップ
ppya ッピャ
ppyo ッピョ
ppyu ッピュ
pu プ
pya ピャ
pyo ピョ
pyu ピュ
ra ラ
re レ
ri リ
ro ロ
rra ッラ
rre ッレ
rri ッリ
rro ッロ
rru ッル
rrya ッリャ
rryo ッリョ
rryu ッリュ
ru ル
rya リャ
ryo リョ
ryu リュ
sa サ
se セ
sha シャ
shi シ
sho ショ
shu シュ
so ソ
ssa ッサ
sse ッセ
ssha ッシャ
sshi ッシ
ssho ッショ
sshu ッシュ
sso ッソ
ssu ッス
su ス
ta タ
tcha ッチャ
tchi ッチ
tcho ッチョ
tchu ッチュ
te テ
to ト
tsu ッ
tsu ツ
tta ッタ
tte ッテ
tto ット
ttsu ッツ
u ゥ
u ウ
va ヴァ
ve ヴェ
vi ヴィ
vo ヴォ
vu ヴ
vva ッヴァ
vve ッヴェ
vvi ッヴィ
vvo ッヴォ
vvu ッヴ
wa ヮ
wa ワ
wo ヲ
ya ャ
ya ヤ
yo ョ
yo ヨ
yu ュ
yu ユ
yya ッヤ
yyo ッヨ
yyu ッユ
za ザ
ze ゼ
zo ゾ
zu ズ
zu ヅ
zza ッザ
zzo ッゾ
zzu ッズ
zzu ッヅ
;;
;; extended characters
;;
;;
;; gairai terms
;;
all オール
algrism アルゴリズム
answer アンサー
base ベース
begineer ビギナー
connection コネクション
contents コンテンツ
creator クリエーター
comic コミック
comics コミックス
culture カルチャー
debug デバッグ
debugging デバッギング
design デザイン
digital デジタル
dillenma ジレンマ
directory ディレクトリ
disk ディスク
document ドキュメント
download ダウンロード
electric エレクトリック
facebook フェイスブック
firefox ファイアーフォックス
folder フォルダ
format フォーマット
forum フォーラム
fox フォックス
free フリー
gnome ノーム
gnu グヌー
gozilla ゴジラ
guide ガイド
harvard ハーバード
help ヘルプ
highlight ハイライト
japan ジャパン
journal ジャーナル
library ライブラリ
line ライン
love ラヴ
love ラブ
mail メール
main メイン
mystery ミステリ
mozilla モジラ
network ネットワーク
next ネクスト
new ニュー
news ニュース
native ネイティブ
online オンライン
open オープン
professional プロフェッショナル
profile プロファイル
programmer プログラマ
sample サンプル
series シリーズ
share シェア
social ソーシャル
society ソサエティ
software ソフトウエア
source ソース
street ストリート
system システム
tag タグ
text テキスト
thunderbird サンダーバード
training トレーニング
twitter ツイッター
unicode ユニコード
wall ウオール
wall ウォール
welcome ウェルカム
welcome ウエルカム
wikinomics ウィキノミクス
york ヨーク
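As the header of this table notes, lookups assume NFKC-normalized input; NFKC also folds half-width katakana into the full-width forms used as values here. A standalone check:

import unicodedata
print(unicodedata.normalize('NFKC', u'\uff76\uff9e') == u'\u30ac')  # halfwidth ka + voicing mark -> ガ (True)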

File diff suppressed because it is too large.


@@ -1,12 +1,17 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
'''
Decode unicode text to an ASCII representation of the text. Transliterate
unicode characters to ASCII.
Decode unicode text to an ASCII representation of the text in Chinese.
Transliterate unicode characters to ASCII based on Chinese pronunciation.
Derived from John Schember's unidecode library, which was created
as part of calibre.
Copyright(c) 2009, John Schember <john@nachtimwald.com>
Based on the ruby unidecode gem (http://rubyforge.org/projects/unidecode/) which
is based on the perl module Text::Unidecode
@@ -55,29 +60,20 @@ it under the same terms as Perl itself.
'''
import re
from calibre.ebooks.unidecode.unicodepoints import CODEPOINTS
from calibre.constants import preferred_encoding
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
from calibre.ebooks.unihandecode.zhcodepoints import CODEPOINTS as HANCODES
class Unidecoder(object):
codepoints = {}
def __init__(self):
self.codepoints = CODEPOINTS
self.codepoints.update(HANCODES)
def decode(self, text):
'''
Transliterate the string from unicode characters to ASCII.
'''
# The keys for CODEPOINTS is unicode characters, we want to be sure the
# input text is unicode.
if not isinstance(text, unicode):
try:
text = unicode(text)
except:
try:
text = text.decode(preferred_encoding)
except:
text = text.decode('utf-8', 'replace')
# Replace characters larger than 127 with their ASCII equivalent.
return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),
text)
return re.sub('[^\x00-\x7f]',lambda x: self.replace_point(x.group()), text)
def replace_point(self, codepoint):
'''
@@ -87,7 +83,7 @@ class Unidecoder(object):
# Split the unicode character xABCD into parts 0xAB and 0xCD.
# 0xAB represents the group within CODEPOINTS to query and 0xCD
# represents the position in the list of characters for the group.
return CODEPOINTS[self.code_group(codepoint)][self.grouped_point(
return self.codepoints[self.code_group(codepoint)][self.grouped_point(
codepoint)]
except:
return '?'
@@ -97,12 +93,18 @@
Find what group character is a part of.
'''
# Code groups within CODEPOINTS take the form 'xAB'
return u'x%02x' % (ord(unicode(character)) >> 8)
try:#python2
return 'x%02x' % (ord(unicode(character)) >> 8)
except:
return 'x%02x' % (ord(character) >> 8)
def grouped_point(self, character):
'''
Return the location of the replacement character within the list for
the group the character is a part of.
'''
return ord(unicode(character)) & 255
try:#python2
return ord(unicode(character)) & 255
except:
return ord(character) & 255

File diff suppressed because it is too large.


@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
'''
Decode unicode text to an ASCII representation of the text in Vietnamese.
'''
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
from calibre.ebooks.unihandecode.vncodepoints import CODEPOINTS as HANCODES
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
class Vndecoder(Unidecoder):
codepoints = {}
def __init__(self):
self.codepoints = CODEPOINTS
self.codepoints.update(HANCODES)

File diff suppressed because it is too large.


@@ -271,11 +271,6 @@ class DeleteAction(InterfaceAction):
partial(self.library_ids_deleted, current_row=row))
# Device view is visible.
else:
if not confirm('<p>'+_('The selected books will be '
'<b>permanently deleted</b> '
'from your device. Are you sure?')
+'</p>', 'device_delete_books', self.gui):
return
if self.gui.stack.currentIndex() == 1:
view = self.gui.memory_view
elif self.gui.stack.currentIndex() == 2:
@@ -283,8 +278,14 @@
else:
view = self.gui.card_b_view
paths = view.model().paths(rows)
ids = view.model().indices(rows)
if not confirm('<p>'+_('The selected books will be '
'<b>permanently deleted</b> '
'from your device. Are you sure?')
+'</p>', 'device_delete_books', self.gui):
return
job = self.gui.remove_paths(paths)
self.delete_memory[job] = (paths, view.model())
view.model().mark_for_deletion(job, rows)
view.model().mark_for_deletion(job, ids, rows_are_ids=True)
self.gui.status_bar.show_message(_('Deleting books from device.'), 1000)


@@ -6,12 +6,12 @@ meaning as possible.
import os
from math import ceil
from calibre.ebooks.unidecode.unidecoder import Unidecoder
from calibre import sanitize_file_name
from calibre.constants import preferred_encoding, iswindows
udc = Unidecoder()
from calibre.utils.localization import get_udc
def ascii_text(orig):
udc = get_udc()
try:
ascii = udc.decode(orig)
except:


@@ -169,3 +169,13 @@ def set_qt_translator(translator):
return translator.load(p)
return False
_udc = None
def get_udc():
global _udc
if _udc is None:
from calibre.ebooks.unihandecode import Unihandecoder
_udc = Unihandecoder(lang=get_lang())
return _udc
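Usage sketch for the new helper: get_udc() builds a single Unihandecoder for the current interface language on first use and caches it.

from calibre.utils.localization import get_udc
print(get_udc().decode(u'caf\xe9'))  # -> 'cafe' (default decoder; language-specific ones romanize CJK)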