improve filename/path conversion using unihandecode

Hiroshi Miura 2010-12-25 23:55:07 +09:00
parent 44ca3ea808
commit 0a22d4af1a
12 changed files with 21330 additions and 1560 deletions


@@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'
 import functools, re
 from calibre import entity_to_unicode
+from calibre.utils.config import prefs
 XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 SVG_NS = 'http://www.w3.org/2000/svg'
@@ -523,9 +524,9 @@ class HTMLPreProcessor(object):
         html = XMLDECL_RE.sub('', html)
         if getattr(self.extra_opts, 'asciiize', False):
-            from calibre.ebooks.unidecode.unidecoder import Unidecoder
-            unidecoder = Unidecoder()
-            html = unidecoder.decode(html)
+            from calibre.ebooks.unihandecode import Unihandecoder
+            unihandecoder = Unihandecoder(lang=prefs['language'])
+            html = unihandecoder.decode(html)
         if self.plugin_preprocess:
             html = self.input_plugin_preprocess(self.extra_opts, html)
@@ -535,10 +536,10 @@ class HTMLPreProcessor(object):
         unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
         if unsupported_unicode_chars:
-            from calibre.ebooks.unidecode.unidecoder import Unidecoder
-            unidecoder = Unidecoder()
+            from calibre.ebooks.unihandecode import Unihandecoder
+            unihandecoder = Unihandecoder(lang=prefs['language'])
             for char in unsupported_unicode_chars:
-                asciichar = unidecoder.decode(char)
+                asciichar = unihandecoder.decode(char)
                 html = html.replace(char, asciichar)
         return html
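For context, a minimal usage sketch of the new asciiize path (Python 2, matching calibre of this era; Unihandecoder and prefs['language'] are taken from the hunk above, while the sample HTML string is purely illustrative):

from calibre.ebooks.unihandecode import Unihandecoder
from calibre.utils.config import prefs

html = u'<p>\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f</p>'
# The decoder is built with the user's interface language, then every
# non-ASCII character in the HTML is transliterated to ASCII.
decoder = Unihandecoder(lang=prefs['language'])
html = decoder.decode(html)
print html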


@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
__all__ = ["Unihandecoder"]

'''
Decode unicode text to an ASCII representation of the text.
Translate unicode characters to ASCII.

Inspired by John Schember's unidecode library.
Copyright(c) 2009, John Schember

Transliterates strings from unicode characters to ASCII for Chinese and
other languages.
'''

from unidecoder import Unidecoder
from jadecoder import Jadecoder
from krdecoder import Krdecoder


class Unihandecoder(object):
    preferred_encoding = None
    lang = None

    def __init__(self, lang="zh", encoding='utf-8'):
        self.preferred_encoding = encoding
        self.lang = lang

    def decode(self, text):
        '''
        Example conversions: "明天明天的风吹", "明日は明日の風が吹く"
        and "내일은 내일 바람이 분다"

        >>> d = Unihandecoder(lang="zh")
        >>> print d.decode(u"\u660e\u5929\u660e\u5929\u7684\u98ce\u5439")
        Ming Tian Ming Tian De Feng Chui
        >>> d = Unihandecoder(lang="ja")
        >>> print d.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
        Ashita ha Ashita no Kaze ga Fuku
        >>> d = Unihandecoder(lang="kr")
        >>> print d.decode(u'\ub0b4\uc77c\uc740 \ub0b4\uc77c \ubc14\ub78c\uc774 \ubd84\ub2e4')
        naeileun naeil barami bunda
        '''
        # The codepoint tables are keyed by unicode characters, so coerce
        # byte-string input to unicode before decoding.
        if not isinstance(text, unicode):
            try:
                text = unicode(text)
            except:
                try:
                    text = text.decode(self.preferred_encoding)
                except:
                    text = text.decode('utf-8', 'replace')

        if self.lang == "ja":
            d = Jadecoder()
            return d.decode(text)
        elif self.lang == "kr":
            d = Krdecoder()
            return d.decode(text)
        else:
            d = Unidecoder()
            return d.decode(text)


def _test():
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()
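A brief usage sketch of the dispatch above (outputs taken from the doctests in this file; any lang value other than "ja" or "kr" falls through to the Chinese-reading Unidecoder, and the Japanese result additionally requires libkakasi, see jadecoder below):

from calibre.ebooks.unihandecode import Unihandecoder

d = Unihandecoder(lang="ja")   # Japanese: dispatches to the kakasi-backed Jadecoder
print d.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
# Ashita ha Ashita no Kaze ga Fuku

d = Unihandecoder(lang="de")   # any unlisted language code: falls back to Unidecoder
print d.decode(u"\u660e\u5929\u660e\u5929\u7684\u98ce\u5439")
# Ming Tian Ming Tian De Feng Chui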

File diff suppressed because it is too large


@@ -0,0 +1,84 @@
# coding: utf8
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'

'''
Decode unicode text to an ASCII representation of the text for Japanese.
Translate a unicode string to an ASCII roman string.

The API is based on the python unidecode, which is based on a Ruby gem
(http://rubyforge.org/projects/unidecode/) and the perl module Text::Unidecode
(http://search.cpan.org/~sburke/Text-Unidecode-0.04/).

The transliteration itself is provided by the KAKASI Japanese processing engine.

Copyright (c) 2010 Hiroshi Miura
'''

import os, re
from ctypes import CDLL, c_char_p

from unidecoder import Unidecoder
from unicodepoints import CODEPOINTS
from jacodepoints import CODEPOINTS as JACODES


class Jadecoder(Unidecoder):

    # kakasi instance
    kakasi = None
    codepoints = {}

    def __init__(self):
        self.codepoints = CODEPOINTS
        self.codepoints.update(JACODES)
        # Load the shared kakasi library if it is installed; otherwise the
        # decoder falls back to the per-character codepoint tables.
        try:
            if os.name == "nt":
                self.kakasi = CDLL("libkakasi")
            elif os.name == "posix":
                self.kakasi = CDLL("libkakasi.so")
            else:
                self.kakasi = None
        except:
            self.kakasi = None

    def decode(self, text):
        '''
        Translate the string from unicode characters to ASCII in Japanese.

        Example conversions: "明日は明日の風が吹く" and "明天明天的风吹"

        >>> k = Jadecoder()
        >>> print k.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
        Ashita ha Ashita no Kaze ga Fuku
        >>> print k.decode(u'\u660e\u5929\u660e\u5929\u7684\u98ce\u5439')
        MeiTenMeiTenTekiSui
        '''
        # If the kakasi library is not available, fall back to the
        # unidecode-style codepoint lookup.
        if self.kakasi is None:
            return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)

        # kakasi options: transliterate kanji, hiragana, katakana and symbols
        # to ascii, capitalize words (-C), insert separators (-s), and read
        # the input as EUC-JP (-ieuc).
        numopt = 9
        argArray = c_char_p * numopt
        args = argArray(c_char_p("kakasi"),
                c_char_p("-Ja"), c_char_p("-Ha"), c_char_p("-Ka"), c_char_p("-Ea"),
                c_char_p("-ka"), c_char_p("-C"), c_char_p("-s"),
                c_char_p("-ieuc"))
        self.kakasi.kakasi_getopt_argv(numopt, args)
        kakasi_do = self.kakasi.kakasi_do
        kakasi_do.restype = c_char_p
        try:
            cstr = c_char_p(text.encode("eucjp"))
            return kakasi_do(cstr).decode("eucjp")
        except:
            return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)


def _test():
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()
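A short usage sketch of the decoder above (the output line comes from the file's own doctest; the module path assumes jadecoder.py sits in the unihandecode package alongside __init__.py, and libkakasi must be loadable for the kakasi-backed result, otherwise the codepoint-table fallback is used):

from calibre.ebooks.unihandecode.jadecoder import Jadecoder

k = Jadecoder()
# With libkakasi loaded, kakasi performs the kana/kanji to romaji conversion:
print k.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
# Ashita ha Ashita no Kaze ga Fuku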

File diff suppressed because it is too large


@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'

'''
Decode unicode text to an ASCII representation of the text in Korean.
Based on unidecoder.
'''

import re

from unidecoder import Unidecoder
from krcodepoints import CODEPOINTS as HANCODES
from unicodepoints import CODEPOINTS


class Krdecoder(Unidecoder):

    codepoints = {}

    def __init__(self):
        self.codepoints = CODEPOINTS
        self.codepoints.update(HANCODES)

    def decode(self, text):
        '''
        Example conversions:

        >>> h = Krdecoder()
        >>> print h.decode(u"내일은 내일 바람이 분다")
        naeileun naeil barami bunda
        >>> print h.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
        MyengIlhaMyengIlnoPhwunggaChwiku
        '''
        # Replace characters larger than 127 with their ASCII equivalent.
        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),
                text)

    def replace_point(self, codepoint):
        '''
        Returns the replacement character, or '?' if none can be found.
        '''
        try:
            # Split the unicode character xABCD into parts 0xAB and 0xCD.
            # 0xAB represents the group within CODEPOINTS to query and 0xCD
            # represents the position in the list of characters for the group.
            return self.codepoints[self.code_group(codepoint)][self.grouped_point(
                codepoint)]
        except:
            return '?'


def _test():
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()


@@ -1,12 +1,16 @@
 # -*- coding: utf-8 -*-
 __license__ = 'GPL 3'
-__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
 __docformat__ = 'restructuredtext en'
 '''
-Decode unicode text to an ASCII representation of the text. Transliterate
-unicode characters to ASCII.
+Decode unicode text to an ASCII representation of the text in Chinese.
+Transliterate unicode characters to ASCII based on Chinese pronunciation.
+Derived from John Schember's unidecode library.
+Copyright(c) 2009, John Schember
 Based on the ruby unidecode gem (http://rubyforge.org/projects/unidecode/) which
 is based on the perl module Text::Unidecode
@@ -55,29 +59,29 @@ it under the same terms as Perl itself.
 '''
 import re
-from calibre.ebooks.unidecode.unicodepoints import CODEPOINTS
-from calibre.constants import preferred_encoding
+from unicodepoints import CODEPOINTS
+from zhcodepoints import CODEPOINTS as HANCODES
 class Unidecoder(object):
+    codepoints = {}
+
+    def __init__(self):
+        self.codepoints = CODEPOINTS
+        self.codepoints.update(HANCODES)
     def decode(self, text):
         '''
-        Tranliterate the string from unicode characters to ASCII.
+        Transliterate the string from unicode characters to ASCII in Chinese and others.
+        Example conversions: "明天明天的风吹" and "明日は明日の風が吹く"
+
+        >>> u = Unidecoder()
+        >>> print u.decode(u"\u660e\u5929\u660e\u5929\u7684\u98ce\u5439")
+        Ming Tian Ming Tian De Feng Chui
+        >>> print u.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
+        Ming Ri haMing Ri noFeng gaChui ku
         '''
-        # The keys for CODEPOINTS is unicode characters, we want to be sure the
-        # input text is unicode.
-        if not isinstance(text, unicode):
-            try:
-                text = unicode(text)
-            except:
-                try:
-                    text = text.decode(preferred_encoding)
-                except:
-                    text = text.decode('utf-8', 'replace')
         # Replace characters larger than 127 with their ASCII equivalent.
-        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),
-            text)
+        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)
     def replace_point(self, codepoint):
         '''
@@ -87,10 +91,10 @@ class Unidecoder(object):
             # Split the unicode character xABCD into parts 0xAB and 0xCD.
             # 0xAB represents the group within CODEPOINTS to query and 0xCD
             # represents the position in the list of characters for the group.
-            return CODEPOINTS[self.code_group(codepoint)][self.grouped_point(
+            return self.codepoints[self.code_group(codepoint)][self.grouped_point(
                 codepoint)]
         except:
-            return '?'
+            return ''
     def code_group(self, character):
         '''
@@ -106,3 +110,10 @@ class Unidecoder(object):
         '''
         return ord(unicode(character)) & 255
+
+def _test():
+    import doctest
+    doctest.testmod()
+
+if __name__ == "__main__":
+    _test()
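The comments above describe the table lookup; here is a small worked example for u'明' (U+660E), assuming code_group returns the 'xAB'-style string key used by the original unidecoder tables:

ch = u'\u660e'                     # u'明'
group = 'x%02x' % (ord(ch) >> 8)   # 'x66' -> which CODEPOINTS group to query
pos = ord(ch) & 255                # 0x0e = 14 -> index within that group
# replace_point() then returns self.codepoints['x66'][14], e.g. u'Ming '
# (matching the "Ming Tian ..." doctest output above); on a lookup failure
# it now returns '' instead of '?'.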

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -6,7 +6,6 @@ meaning as possible.
 import os
 from math import ceil
-from calibre.ebooks.unidecode.unidecoder import Unidecoder
 from calibre import sanitize_file_name
 from calibre.constants import preferred_encoding, iswindows
 udc = Unidecoder()