mirror of https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
improve filename/path conversion from unihandecode
This commit is contained in:
parent 44ca3ea808
commit 0a22d4af1a
@@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'
 import functools, re
 
 from calibre import entity_to_unicode
+from calibre.utils.config import prefs
 
 XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 SVG_NS = 'http://www.w3.org/2000/svg'
@@ -523,9 +524,9 @@ class HTMLPreProcessor(object):
         html = XMLDECL_RE.sub('', html)
 
         if getattr(self.extra_opts, 'asciiize', False):
-            from calibre.ebooks.unidecode.unidecoder import Unidecoder
-            unidecoder = Unidecoder()
-            html = unidecoder.decode(html)
+            from calibre.ebooks.unihandecode import Unihandecoder
+            unihandecoder = Unihandecoder(lang=prefs['language'])
+            html = unihandecoder.decode(html)
 
         if self.plugin_preprocess:
             html = self.input_plugin_preprocess(self.extra_opts, html)
@@ -535,10 +536,10 @@ class HTMLPreProcessor(object):
 
         unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
         if unsupported_unicode_chars:
-            from calibre.ebooks.unidecode.unidecoder import Unidecoder
-            unidecoder = Unidecoder()
+            from calibre.ebooks.unihandecode import Unihandecoder
+            unihandecoder = Unihandecoder(lang=prefs['language'])
             for char in unsupported_unicode_chars:
-                asciichar = unidecoder.decode(char)
+                asciichar = unihandecoder.decode(char)
                 html = html.replace(char, asciichar)
 
         return html
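For reference, a minimal sketch of the new asciiize path in isolation; the prefs['language'] lookup and the Unihandecoder call are taken from the hunks above, while the standalone asciiize_html helper is only illustrative and not part of the commit:

    from calibre.utils.config import prefs
    from calibre.ebooks.unihandecode import Unihandecoder

    def asciiize_html(html):
        # Transliterate every non-ASCII character to a romanized ASCII form,
        # using the decoder that matches the configured interface language.
        unihandecoder = Unihandecoder(lang=prefs['language'])
        return unihandecoder.decode(html)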
72  src/calibre/ebooks/unihandecode/__init__.py  Normal file
@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-

__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'
__all__ = ["Unihandecoder"]

'''
Decode unicode text to an ASCII representation of the text.
Translate unicode characters to ASCII.

Inspired by John's unidecode library.
Copyright(c) 2009, John Schember

Transliterate the string from unicode characters to ASCII, in Chinese and others.
'''

from unidecoder import Unidecoder
from jadecoder import Jadecoder
from krdecoder import Krdecoder

class Unihandecoder(object):
    preferred_encoding = None
    lang = None

    def __init__(self, lang="zh", encoding='utf-8'):
        self.preferred_encoding = encoding
        self.lang = lang

    def decode(self, text):
        '''
        Example conversions: "明天明天的风吹", "明日は明日の風が吹く"
        and "내일은 내일 바람이 분다"
        >>> d = Unihandecoder(lang="zh")
        >>> print d.decode(u"\u660e\u5929\u660e\u5929\u7684\u98ce\u5439")
        Ming Tian Ming Tian De Feng Chui
        >>> d = Unihandecoder(lang="ja")
        >>> print d.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
        Ashita ha Ashita no Kaze ga Fuku
        >>> d = Unihandecoder(lang="kr")
        >>> print d.decode(u'\ub0b4\uc77c\uc740 \ub0b4\uc77c \ubc14\ub78c\uc774 \ubd84\ub2e4')
        naeileun naeil barami bunda
        '''
        # The codepoint tables are keyed on unicode characters, so make sure
        # the input text is unicode before dispatching.
        if not isinstance(text, unicode):
            try:
                text = unicode(text)
            except:
                try:
                    text = text.decode(self.preferred_encoding)
                except:
                    text = text.decode('utf-8', 'replace')

        if self.lang == "ja":
            d = Jadecoder()
            return d.decode(text)
        elif self.lang == "kr":
            d = Krdecoder()
            return d.decode(text)
        else:
            d = Unidecoder()
            return d.decode(text)

def _test():
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()
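A hedged usage sketch of the dispatcher, mirroring the doctests above; nothing beyond the code in this file is assumed. The lang argument selects Jadecoder, Krdecoder or the default Chinese-reading Unidecoder, and decode() always returns ASCII text:

    from calibre.ebooks.unihandecode import Unihandecoder

    samples = [('zh', u'\u660e\u5929\u660e\u5929\u7684\u98ce\u5439'),
               ('ja', u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')]
    for lang, text in samples:
        # Each language gets its own decoder; the result is plain ASCII text.
        print Unihandecoder(lang=lang).decode(text)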
5251  src/calibre/ebooks/unihandecode/jacodepoints.py  Normal file
File diff suppressed because it is too large
84  src/calibre/ebooks/unihandecode/jadecoder.py  Normal file
@@ -0,0 +1,84 @@
# coding:utf8
__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'

'''
Decode unicode text to an ASCII representation of the text for Japanese.
Translate a unicode string to an ASCII roman string.

The API is based on the python unidecode module,
which is based on the Ruby gem (http://rubyforge.org/projects/unidecode/)
and the perl module Text::Unidecode
(http://search.cpan.org/~sburke/Text-Unidecode-0.04/).

The Japanese romanization is provided by the Kakasi processing engine.

Copyright (c) 2010 Hiroshi Miura
'''

from ctypes import *
import os, re
from unidecoder import Unidecoder
from unicodepoints import CODEPOINTS
from jacodepoints import CODEPOINTS as JACODES

class Jadecoder(Unidecoder):

    # kakasi instance
    kakasi = None

    codepoints = {}

    def __init__(self):
        self.codepoints = CODEPOINTS
        self.codepoints.update(JACODES)

        try:
            if os.name == "nt":
                self.kakasi = CDLL("libkakasi")
            elif os.name == "posix":
                self.kakasi = CDLL("libkakasi.so")
            else:
                self.kakasi = None
        except:
            self.kakasi = None

    def decode(self, text):
        '''
        Translate the string from unicode characters to ASCII in Japanese.
        Example conversions: "明日は明日の風が吹く" and "明天明天的风吹"
        >>> k = Jadecoder()
        >>> print k.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
        Ashita ha Ashita no Kaze ga Fuku
        >>> print k.decode(u'\u660e\u5929\u660e\u5929\u7684\u98ce\u5439')
        MeiTenMeiTenTekiSui
        '''

        # If the kakasi library is not available, fall back to the codepoint
        # tables inherited from Unidecoder.
        if self.kakasi is None:
            return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)

        numopt = 9
        argArray = c_char_p * numopt
        args = argArray(c_char_p("kakasi"),
                c_char_p("-Ja"), c_char_p("-Ha"), c_char_p("-Ka"), c_char_p("-Ea"),
                c_char_p("-ka"), c_char_p("-C"), c_char_p("-s"),
                c_char_p("-ieuc"))
        self.kakasi.kakasi_getopt_argv(numopt, args)
        kakasi_do = self.kakasi.kakasi_do
        kakasi_do.restype = c_char_p

        try:
            cstr = c_char_p(text.encode("eucjp"))
            return kakasi_do(cstr).decode("eucjp")
        except:
            return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)

def _test():
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()
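A possible refinement, noted here only as an assumption and not part of this commit: ctypes.util.find_library can resolve the platform-specific name of the kakasi shared library instead of hard-coding "libkakasi" and "libkakasi.so" per os.name:

    from ctypes import CDLL
    from ctypes.util import find_library

    # find_library('kakasi') returns something like 'libkakasi.so.2' on Linux,
    # or None when the library is absent, in which case the codepoint-table
    # fallback above still applies.
    libname = find_library('kakasi')
    kakasi = CDLL(libname) if libname else None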
5251  src/calibre/ebooks/unihandecode/krcodepoints.py  Normal file
File diff suppressed because it is too large
58  src/calibre/ebooks/unihandecode/krdecoder.py  Normal file
@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-

__license__ = 'GPL 3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
__docformat__ = 'restructuredtext en'

'''
Decode unicode text to an ASCII representation of the text in Korean.
Based on unidecoder.
'''

import re
from unidecoder import Unidecoder
from krcodepoints import CODEPOINTS as HANCODES
from unicodepoints import CODEPOINTS

class Krdecoder(Unidecoder):

    codepoints = {}

    def __init__(self):
        self.codepoints = CODEPOINTS
        self.codepoints.update(HANCODES)

    def decode(self, text):
        '''
        Example conversions:
        >>> h = Krdecoder()
        >>> print h.decode(u"내일은 내일 바람이 분다")
        naeileun naeil barami bunda
        >>> print h.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
        MyengIlhaMyengIlnoPhwunggaChwiku
        '''
        # Replace characters larger than 127 with their ASCII equivalent.
        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),
            text)

    def replace_point(self, codepoint):
        '''
        Returns the replacement character or ? if none can be found.
        '''
        try:
            # Split the unicode character xABCD into parts 0xAB and 0xCD.
            # 0xAB represents the group within CODEPOINTS to query and 0xCD
            # represents the position in the list of characters for the group.
            return self.codepoints[self.code_group(codepoint)][self.grouped_point(
                codepoint)]
        except:
            return '?'

def _test():
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()
File diff suppressed because it is too large
@@ -1,12 +1,16 @@
 # -*- coding: utf-8 -*-
 
 __license__ = 'GPL 3'
-__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
 __docformat__ = 'restructuredtext en'
 
 '''
-Decode unicode text to an ASCII representation of the text. Transliterate
-unicode characters to ASCII.
+Decode unicode text to an ASCII representation of the text in Chinese.
+Transliterate unicode characters to ASCII based on Chinese pronunciation.
+
+Derived from John's unidecode library.
+
+Copyright(c) 2009, John Schember
 
 Based on the ruby unidecode gem (http://rubyforge.org/projects/unidecode/) which
 is based on the perl module Text::Unidecode
@@ -55,29 +59,29 @@ it under the same terms as Perl itself.
 '''
 
 import re
 
-from calibre.ebooks.unidecode.unicodepoints import CODEPOINTS
-from calibre.constants import preferred_encoding
+from unicodepoints import CODEPOINTS
+from zhcodepoints import CODEPOINTS as HANCODES
 
 class Unidecoder(object):
 
+    codepoints = {}
+
+    def __init__(self):
+        self.codepoints = CODEPOINTS
+        self.codepoints.update(HANCODES)
+
     def decode(self, text):
         '''
-        Tranliterate the string from unicode characters to ASCII.
+        Transliterate the string from unicode characters to ASCII in Chinese and others.
+        Example conversions: "明天明天的风吹" and "明日は明日の風が吹く"
+        >>> u = Unidecoder()
+        >>> print u.decode(u"\u660e\u5929\u660e\u5929\u7684\u98ce\u5439")
+        Ming Tian Ming Tian De Feng Chui
+        >>> print u.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
+        Ming Ri haMing Ri noFeng gaChui ku
         '''
-        # The keys for CODEPOINTS is unicode characters, we want to be sure the
-        # input text is unicode.
-        if not isinstance(text, unicode):
-            try:
-                text = unicode(text)
-            except:
-                try:
-                    text = text.decode(preferred_encoding)
-                except:
-                    text = text.decode('utf-8', 'replace')
-        # Replace characters larger than 127 with their ASCII equivelent.
-        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),
-            text)
+        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()), text)
 
     def replace_point(self, codepoint):
         '''
@@ -87,10 +91,10 @@ class Unidecoder(object):
             # Split the unicode character xABCD into parts 0xAB and 0xCD.
             # 0xAB represents the group within CODEPOINTS to query and 0xCD
             # represents the position in the list of characters for the group.
-            return CODEPOINTS[self.code_group(codepoint)][self.grouped_point(
+            return self.codepoints[self.code_group(codepoint)][self.grouped_point(
                 codepoint)]
         except:
-            return '?'
+            return ''
 
     def code_group(self, character):
         '''
@@ -106,3 +110,10 @@ class Unidecoder(object):
         '''
         return ord(unicode(character)) & 255
 
+
+def _test():
+    import doctest
+    doctest.testmod()
+
+if __name__ == "__main__":
+    _test()
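To make the table lookup in replace_point() concrete, here is a small worked example of the split described in its comments; the exact key format produced by code_group() is defined elsewhere in this file and is not repeated here:

    # For U+660E (明), the codepoint 0x660e is split into two bytes:
    ch = u'\u660e'
    group = ord(ch) >> 8    # 0x66 -> selects the group within the codepoint table
    offset = ord(ch) & 255  # 0x0e -> grouped_point(), the index inside that group
    # replace_point() then returns self.codepoints[self.code_group(ch)][offset],
    # which the Chinese tables romanize as 'Ming' (see the doctest above).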
5251  src/calibre/ebooks/unihandecode/vncodepoints.py  Normal file
File diff suppressed because it is too large
5251  src/calibre/ebooks/unihandecode/zhcodepoints.py  Normal file
File diff suppressed because it is too large
@@ -6,7 +6,6 @@ meaning as possible.
import os
from math import ceil

from calibre.ebooks.unidecode.unidecoder import Unidecoder
from calibre import sanitize_file_name
from calibre.constants import preferred_encoding, iswindows
udc = Unidecoder()