mirror of https://github.com/kovidgoyal/calibre.git

commit 0a22d4af1a (parent 44ca3ea808)

    improve filename/path conversion from unihandecode
@@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'
 import functools, re

 from calibre import entity_to_unicode
+from calibre.utils.config import prefs

 XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 SVG_NS = 'http://www.w3.org/2000/svg'
@@ -523,9 +524,9 @@ class HTMLPreProcessor(object):
         html = XMLDECL_RE.sub('', html)

         if getattr(self.extra_opts, 'asciiize', False):
-            from calibre.ebooks.unidecode.unidecoder import Unidecoder
-            unidecoder = Unidecoder()
-            html = unidecoder.decode(html)
+            from calibre.ebooks.unihandecode import Unihandecoder
+            unihandecoder = Unihandecoder(lang=prefs['language'])
+            html = unihandecoder.decode(html)

         if self.plugin_preprocess:
             html = self.input_plugin_preprocess(self.extra_opts, html)
@@ -535,10 +536,10 @@ class HTMLPreProcessor(object):

         unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
         if unsupported_unicode_chars:
-            from calibre.ebooks.unidecode.unidecoder import Unidecoder
-            unidecoder = Unidecoder()
+            from calibre.ebooks.unihandecode import Unihandecoder
+            unihandecoder = Unihandecoder(lang=prefs['language'])
             for char in unsupported_unicode_chars:
-                asciichar = unidecoder.decode(char)
+                asciichar = unihandecoder.decode(char)
                 html = html.replace(char, asciichar)

         return html
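For reference, a minimal usage sketch of the new call path introduced above (illustrative only; it assumes the unihandecode package is importable and that prefs['language'] resolves to one of the supported codes 'zh', 'ja' or 'kr'):

    from calibre.utils.config import prefs
    from calibre.ebooks.unihandecode import Unihandecoder

    # the asciiize path now picks a transliterator based on the user's language preference
    decoder = Unihandecoder(lang=prefs['language'])
    print decoder.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
    # with lang='ja' and libkakasi available this prints: Ashita ha Ashita no Kaze ga Fuku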
|
72
src/calibre/ebooks/unihandecode/__init__.py
Normal file
72
src/calibre/ebooks/unihandecode/__init__.py
Normal file
@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
+__docformat__ = 'restructuredtext en'
+__all__ = ["Unihandecoder"]
+
+'''
+Decode unicode text to an ASCII representation of the text.
+Translate unicode characters to ASCII.
+
+inspired from John's unidecode library.
+Copyright(c) 2009, John Schember
+
+Tranliterate the string from unicode characters to ASCII in Chinese and others.
+
+'''
+
+from unidecoder import Unidecoder
+from jadecoder import Jadecoder
+from krdecoder import Krdecoder
+
+class Unihandecoder(object):
+    preferred_encoding = None
+    lang = None
+
+    def __init__(self, lang="zh", encoding='utf-8'):
+        self.preferred_encoding = encoding
+        self.lang = lang
+
+    def decode(self, text):
+        '''
+        example convert: "明天明天的风吹", "明日は明日の風が吹く"
+        and "내일은 내일 바람이 분다"
+        >>> d = Unihandecoder(lang="zh")
+        >>> print d.decode(u"\u660e\u5929\u660e\u5929\u7684\u98ce\u5439")
+        Ming Tian Ming Tian De Feng Chui
+        >>> d = Unihandecoder(lang="ja")
+        >>> print d.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
+        Ashita ha Ashita no Kaze ga Fuku
+        >>> d = Unihandecoder(lang="kr")
+        >>> print d.decode(u'\ub0b4\uc77c\uc740 \ub0b4\uc77c \ubc14\ub78c\uc774 \ubd84\ub2e4')
+        naeileun naeil barami bunda
+
+        '''
+
+        if not isinstance(text, unicode):
+            try:
+                text = unicode(text)
+            except:
+                try:
+                    text = text.decode(self.preferred_encoding)
+                except:
+                    text = text.decode('utf-8', 'replace')
+
+        if self.lang is "ja":
+            d = Jadecoder()
+            return d.decode(text)
+        elif self.lang is "kr":
+            d = Krdecoder()
+            return d.decode(text)
+        else:
+            d = Unidecoder()
+            return d.decode(text)
+
+def _test():
+    import doctest
+    doctest.testmod()
+
+if __name__ == "__main__":
+    _test()
+
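One detail worth flagging in the dispatch above: decode() compares strings with "is", which only happens to work because CPython interns short string literals. An equivalent sketch using value comparison (not part of the commit):

    # equivalent dispatch using value equality instead of object identity
    if self.lang == "ja":
        return Jadecoder().decode(text)
    elif self.lang == "kr":
        return Krdecoder().decode(text)
    return Unidecoder().decode(text)

The same caveat applies to the os.name is "nt" / os.name is "posix" checks in jadecoder.py below.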
src/calibre/ebooks/unihandecode/jacodepoints.py (new file, 5251 lines; diff suppressed because it is too large)
src/calibre/ebooks/unihandecode/jadecoder.py (new file, 84 lines)
@@ -0,0 +1,84 @@
+# coding:utf8
+__license__ = 'GPL 3'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Decode unicode text to an ASCII representation of the text for Japanese.
+Translate unicode string to ASCII roman string.
+
+API is based on the python unidecode,
+which is based on Ruby gem (http://rubyforge.org/projects/unidecode/)
+and perl module Text::Unidecode
+(http://search.cpan.org/~sburke/Text-Unidecode-0.04/).
+
+This functionality is owned by Kakasi Japanese processing engine.
+
+Copyright (c) 2010 Hiroshi Miura
+'''
+
+from ctypes import *
+import os, re
+from unidecoder import Unidecoder
+from unicodepoints import CODEPOINTS
+from jacodepoints import CODEPOINTS as JACODES
+
+class Jadecoder(Unidecoder):
+
+    #kakasi instance
+    kakasi = None
+
+    codepoints = {}
+
+    def __init__(self):
+        self.codepoints = CODEPOINTS
+        self.codepoints.update(JACODES)
+
+        try:
+            if os.name is "nt":
+                self.kakasi = CDLL("libkakasi")
+            elif os.name is "posix":
+                self.kakasi = CDLL("libkakasi.so")
+            else:
+                self.kakasi = None
+        except:
+            self.kakasi = None
+
+    def decode(self, text):
+        '''
+        Translate the string from unicode characters to ASCII in Japanese.
+        example convert "明日は明日の風が吹く", and "明天明天的风吹"
+        >>> k = Jadecoder()
+        >>> print k.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
+        Ashita ha Ashita no Kaze ga Fuku
+        >>> print k.decode(u'\u660e\u5929\u660e\u5929\u7684\u98ce\u5439')
+        MeiTenMeiTenTekiSui
+        '''
+
+        # if there is not kakasi library, we fall down to use unidecode
+        if self.kakasi is None:
+            return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),text)
+
+        numopt = 9
+        argArray = c_char_p * numopt
+        args = argArray( c_char_p("kakasi")
+                        ,c_char_p("-Ja"),c_char_p("-Ha"),c_char_p("-Ka"),c_char_p("-Ea")
+                        ,c_char_p("-ka"),c_char_p("-C"),c_char_p("-s")
+                        ,c_char_p("-ieuc")
+                        )
+        self.kakasi.kakasi_getopt_argv(numopt, args)
+        kakasi_do = self.kakasi.kakasi_do
+        kakasi_do.restype = c_char_p
+
+        try:
+            cstr = c_char_p(text.encode("eucjp"))
+            return kakasi_do(cstr).decode("eucjp")
+        except:
+            return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),text)
+
+def _test():
+    import doctest
+    doctest.testmod()
+
+if __name__ == "__main__":
+    _test()
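The decode() above drives libkakasi through its argv-style C API. A standalone sketch of the same calling convention (illustrative only; it assumes libkakasi is installed and exports the kakasi_getopt_argv and kakasi_do entry points used above):

    from ctypes import CDLL, c_char_p

    kakasi = CDLL("libkakasi.so")
    # same option list as Jadecoder.__init__ above, EUC-JP input
    opts = ["kakasi", "-Ja", "-Ha", "-Ka", "-Ea", "-ka", "-C", "-s", "-ieuc"]
    argv = (c_char_p * len(opts))(*[c_char_p(o) for o in opts])
    kakasi.kakasi_getopt_argv(len(opts), argv)
    kakasi.kakasi_do.restype = c_char_p

    text = u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f'
    print kakasi.kakasi_do(c_char_p(text.encode("eucjp"))).decode("eucjp")
    # per the doctest above, this should print: Ashita ha Ashita no Kaze ga Fuku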
src/calibre/ebooks/unihandecode/krcodepoints.py (new file, 5251 lines; diff suppressed because it is too large)
src/calibre/ebooks/unihandecode/krdecoder.py (new file, 58 lines)
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Decode unicode text to an ASCII representation of the text in Korean.
+Based on unidecoder.
+
+'''
+
+import re
+from unidecoder import Unidecoder
+from krcodepoints import CODEPOINTS as HANCODES
+from unicodepoints import CODEPOINTS
+
+class Krdecoder(Unidecoder):
+
+    codepoints = {}
+
+    def __init__(self):
+        self.codepoints = CODEPOINTS
+        self.codepoints.update(HANCODES)
+
+    def decode(self, text):
+        '''
+        example convert
+        >>> h = Krdecoder()
+        >>> print h.decode(u"내일은 내일 바람이 분다")
+        naeileun naeil barami bunda
+        >>> print h.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
+        MyengIlhaMyengIlnoPhwunggaChwiku
+        '''
+        # Replace characters larger than 127 with their ASCII equivelent.
+        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),
+            text)
+
+    def replace_point(self, codepoint):
+        '''
+        Returns the replacement character or ? if none can be found.
+        '''
+        try:
+            # Split the unicode character xABCD into parts 0xAB and 0xCD.
+            # 0xAB represents the group within CODEPOINTS to query and 0xCD
+            # represents the position in the list of characters for the group.
+            return self.codepoints[self.code_group(codepoint)][self.grouped_point(
+                codepoint)]
+        except:
+            return '?'
+
+def _test():
+    import doctest
+    doctest.testmod()
+
+if __name__ == "__main__":
+    _test()
+
File diff suppressed because it is too large.
@@ -1,12 +1,16 @@
 # -*- coding: utf-8 -*-

 __license__ = 'GPL 3'
-__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
 __docformat__ = 'restructuredtext en'

 '''
-Decode unicode text to an ASCII representation of the text. Transliterate
-unicode characters to ASCII.
+Decode unicode text to an ASCII representation of the text in Chinese.
+Transliterate unicode characters to ASCII based on chinese pronounce.

+derived from John's unidecode library.
+
+Copyright(c) 2009, John Schember
+
 Based on the ruby unidecode gem (http://rubyforge.org/projects/unidecode/) which
 is based on the perl module Text::Unidecode
@@ -55,29 +59,29 @@ it under the same terms as Perl itself.
 '''

 import re
-from calibre.ebooks.unidecode.unicodepoints import CODEPOINTS
-from calibre.constants import preferred_encoding
+from unicodepoints import CODEPOINTS
+from zhcodepoints import CODEPOINTS as HANCODES

 class Unidecoder(object):

+    codepoints = {}
+
+    def __init__(self):
+        self.codepoints = CODEPOINTS
+        self.codepoints.update(HANCODES)
+
     def decode(self, text):
         '''
-        Tranliterate the string from unicode characters to ASCII.
+        Tranliterate the string from unicode characters to ASCII in Chinese and others.
+        example convert: "明天明天的风吹" and "明日は明日の風が吹く"
+        >>> u = Unidecoder()
+        >>> print u.decode(u"\u660e\u5929\u660e\u5929\u7684\u98ce\u5439")
+        Ming Tian Ming Tian De Feng Chui
+        >>> print u.decode(u'\u660e\u65e5\u306f\u660e\u65e5\u306e\u98a8\u304c\u5439\u304f')
+        Ming Ri haMing Ri noFeng gaChui ku
         '''
-        # The keys for CODEPOINTS is unicode characters, we want to be sure the
-        # input text is unicode.
-        if not isinstance(text, unicode):
-            try:
-                text = unicode(text)
-            except:
-                try:
-                    text = text.decode(preferred_encoding)
-                except:
-                    text = text.decode('utf-8', 'replace')
         # Replace characters larger than 127 with their ASCII equivelent.
-        return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),
-            text)
+        return re.sub('[^\x00-\x7f]',lambda x: self.replace_point(x.group()), text)

     def replace_point(self, codepoint):
         '''
@@ -87,10 +91,10 @@ class Unidecoder(object):
             # Split the unicode character xABCD into parts 0xAB and 0xCD.
             # 0xAB represents the group within CODEPOINTS to query and 0xCD
             # represents the position in the list of characters for the group.
-            return CODEPOINTS[self.code_group(codepoint)][self.grouped_point(
+            return self.codepoints[self.code_group(codepoint)][self.grouped_point(
                 codepoint)]
         except:
-            return '?'
+            return ''

     def code_group(self, character):
         '''
@@ -106,3 +110,10 @@ class Unidecoder(object):
         '''
         return ord(unicode(character)) & 255
+
+def _test():
+    import doctest
+    doctest.testmod()
+
+if __name__ == "__main__":
+    _test()
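As the comments in replace_point() describe, each non-ASCII character is looked up by splitting its code point into a high byte (the table group) and a low byte (the position within that group). A worked example for U+660E ("明"), assuming code_group() extracts the high byte as those comments state:

    codepoint = 0x660E          # ord(u'\u660e')
    group = codepoint >> 8      # 0x66: which row of the codepoints table to use
    position = codepoint & 255  # 0x0E: index within that row (see grouped_point above)
    # replace_point() then returns self.codepoints[0x66][0x0E],
    # which for the zh tables is 'Ming ' (cf. the doctest output above)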
src/calibre/ebooks/unihandecode/vncodepoints.py (new file, 5251 lines; diff suppressed because it is too large)
src/calibre/ebooks/unihandecode/zhcodepoints.py (new file, 5251 lines; diff suppressed because it is too large)
@@ -6,7 +6,6 @@ meaning as possible.
 import os
 from math import ceil

-from calibre.ebooks.unidecode.unidecoder import Unidecoder
 from calibre import sanitize_file_name
 from calibre.constants import preferred_encoding, iswindows
 udc = Unidecoder()