mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-11-30 10:15:02 -05:00
189 lines
6.3 KiB
Python
189 lines
6.3 KiB
Python
from __future__ import unicode_literals, absolute_import, print_function, division
|
|
#########################################################################
|
|
# #
|
|
# copyright 2002 Paul Henry Tremblay #
|
|
# #
|
|
#########################################################################
|
|
|
|
'''
|
|
Codepages according to the RTF 1.9.1 specification:
|
|
437 United States IBM
|
|
708 Arabic (ASMO 708)
|
|
709 Arabic (ASMO 449+, BCON V4)
|
|
710 Arabic (transparent Arabic)
|
|
711 Arabic (Nafitha Enhanced)
|
|
720 Arabic (transparent ASMO)
|
|
819 Windows 3.1 (United States and Western Europe)
|
|
850 IBM multilingual
|
|
852 Eastern European
|
|
860 Portuguese
|
|
862 Hebrew
|
|
863 French Canadian
|
|
864 Arabic
|
|
865 Norwegian
|
|
866 Soviet Union
|
|
874 Thai
|
|
932 Japanese
|
|
936 Simplified Chinese
|
|
949 Korean
|
|
950 Traditional Chinese
|
|
1250 Eastern European
|
|
1251 Cyrillic
|
|
1252 Western European
|
|
1253 Greek
|
|
1254 Turkish
|
|
1255 Hebrew
|
|
1256 Arabic
|
|
1257 Baltic
|
|
1258 Vietnamese
|
|
1361 Johab
|
|
10000 MAC Roman
|
|
10001 MAC Japan
|
|
10004 MAC Arabic
|
|
10005 MAC Hebrew
|
|
10006 MAC Greek
|
|
10007 MAC Cyrillic
|
|
10029 MAC Latin2
|
|
10081 MAC Turkish
|
|
57002 Devanagari
|
|
57003 Bengali
|
|
57004 Tamil
|
|
57005 Telugu
|
|
57006 Assamese
|
|
57007 Oriya
|
|
57008 Kannada
|
|
57009 Malayalam
|
|
57010 Gujarati
|
|
57011 Punjabi
|
|
'''
|
|
import re
|
|
from . import open_for_read
|
|
|
|
|
|
class DefaultEncoding:
    """
    Find the default encoding (code page) for the doc.

    The input file is scanned lazily, on the first call to
    find_default_encoding(), get_codepage() or get_platform(), and the
    result is cached for later calls.

    Two kinds of input are understood:

    * check_raw=False (default): a file of token lines, where the first
      16 characters of a line name the token (e.g. ``cw<ri<ansi-codpg``)
      and the token value starts at column 20. Scanning stops at the
      ``mi<mk<rtfhed-end`` marker.
    * check_raw=True: raw RTF text, searched with regular expressions
      for the ansicpg control word and for the mac/pc/pca/ansi charset
      control words.

    In both modes a non-zero declared code page wins. When none is
    declared and the document names a non-Windows platform, that
    platform's conventional code page is used. Otherwise the code page
    derived from ``default_encoding`` is kept.
    """

    # Note: not all of these encodings are really supported by rtf2xml
    # See http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx
    # and src\calibre\gui2\widgets.py for the input list in calibre
    ENCODINGS = {
        # Special cases
        'cp1252':'1252',
        'utf-8':'1252',
        'ascii':'1252',
        # Normal cases
        'big5':'950',
        'cp1250':'1250',
        'cp1251':'1251',
        'cp1253':'1253',
        'cp1254':'1254',
        'cp1255':'1255',
        'cp1256':'1256',
        'shift_jis':'932',
        'gb2312':'936',
        # Not in RTF 1.9.1 codepage specification
        'hz':'52936',
        'iso8859_5':'28595',
        'iso2022_jp':'50222',
        'iso2022_kr':'50225',
        'euc_jp':'51932',
        'euc_kr':'51949',
        'gb18030':'54936',
    }

    # Code page assumed for a non-Windows platform when the document does
    # not declare a usable (non-zero) code page itself.
    _PLATFORM_CODE_PAGES = {
        'Macintosh': '10000',
        'IBMPC': '437',
        'OS/2': '850',
    }

    # Token name (first 16 characters of a token line) -> platform.
    _TOKEN_PLATFORMS = {
        'cw<ri<macintosh_': 'Macintosh',
        'cw<ri<pc________': 'IBMPC',
        'cw<ri<pca_______': 'OS/2',
    }

    # Raw RTF charset control word -> platform.
    _RAW_PLATFORMS = {
        'mac': 'Macintosh',
        'pc': 'IBMPC',
        'pca': 'OS/2',
        'ansi': 'Windows',
    }

    # Compiled once at class creation instead of on every scan.
    _RAW_CHARSET = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
    _RAW_CODE_PAGE = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')

    def __init__(self, in_file, bug_handler, default_encoding, run_level=1, check_raw=False):
        """
        in_file -- path of the file to scan (opened with open_for_read).
        bug_handler -- stored on the instance; not used by this class.
        default_encoding -- key into ENCODINGS giving the code page to
            assume when the document does not provide one; unknown keys
            fall back to '1252'.
        run_level -- accepted for interface compatibility; not used here.
        check_raw -- True if in_file is raw RTF rather than token lines.
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__platform = 'Windows'
        self.__default_num = 'not-defined'
        self.__code_page = self.ENCODINGS.get(default_encoding, '1252')
        self.__datafetched = False
        self.__fetchraw = check_raw

    def __ensure_fetched(self):
        """Scan the input file the first time any result is requested."""
        if not self.__datafetched:
            self._encoding()
            self.__datafetched = True

    def find_default_encoding(self):
        """
        Return a (platform, code_page, default_font_number) tuple, where
        code_page is the code page number prefixed with 'ansicpg'.
        """
        self.__ensure_fetched()
        return self.__platform, 'ansicpg' + self.__code_page, self.__default_num

    def get_codepage(self):
        """Return the code page number as a string, e.g. '1252'."""
        self.__ensure_fetched()
        return self.__code_page

    def get_platform(self):
        """Return 'Windows', 'Macintosh', 'IBMPC' or 'OS/2'."""
        self.__ensure_fetched()
        return self.__platform

    @classmethod
    def _scan_tokens(cls, lines, code_page):
        """
        Scan token lines up to the end-of-header marker.

        lines -- iterable of token lines.
        code_page -- code page to keep if the document gives no better one.

        Returns (platform, code_page, default_font_number).
        """
        platform = 'Windows'
        default_num = 'not-defined'
        cp_found = False
        for line in lines:
            token = line[:16]
            if token == 'mi<mk<rtfhed-end':
                break
            # The value follows the 4 character '<nu<' separator and
            # excludes the trailing newline, e.g. cw<ri<deflt-font<nu<0
            value = line[20:-1]
            if token in cls._TOKEN_PLATFORMS:
                platform = cls._TOKEN_PLATFORMS[token]
            elif token == 'cw<ri<ansi-codpg':
                # A code page of 0 is treated as "not specified".
                if int(value):
                    code_page = value
                    # Record the hit here (not on the default font token)
                    # so a declared code page is never overridden by the
                    # platform default below.
                    cp_found = True
            elif token == 'cw<ri<deflt-font':
                default_num = value
        if not cp_found:
            # 'Windows' has no entry, so the incoming code page is kept.
            code_page = cls._PLATFORM_CODE_PAGES.get(platform, code_page)
        return platform, code_page, default_num

    @classmethod
    def _scan_raw(cls, lines, code_page):
        """
        Scan raw RTF lines until the first ansicpg control word.

        lines -- iterable of raw RTF text lines.
        code_page -- code page to keep if the document gives no better one.

        Returns (platform, code_page).
        """
        charset = None
        cp_found = False
        for line in lines:
            if charset is None:
                # Only the first charset keyword counts; later matches
                # could come from the document body.
                charset_match = cls._RAW_CHARSET.search(line)
                if charset_match:
                    charset = charset_match.group(1)
            cp_match = cls._RAW_CODE_PAGE.search(line)
            if cp_match:
                declared = cp_match.group(1)
                # A code page of 0 is treated as "not specified", the
                # same rule the token scanner applies.
                if int(declared):
                    code_page = declared
                    cp_found = True
                break
        # No charset keyword seen: keep the 'Windows' default.
        platform = cls._RAW_PLATFORMS.get(charset, 'Windows')
        if not cp_found:
            code_page = cls._PLATFORM_CODE_PAGES.get(platform, code_page)
        return platform, code_page

    def _encoding(self):
        """Read the input file and update platform, code page and font number."""
        with open_for_read(self.__file) as read_obj:
            if self.__fetchraw:
                # Raw mode does not look for the default font number.
                self.__platform, self.__code_page = self._scan_raw(
                    read_obj, self.__code_page)
            else:
                (self.__platform, self.__code_page,
                 self.__default_num) = self._scan_tokens(
                    read_obj, self.__code_page)
|
|
|
|
|
|
if __name__ == '__main__':
    # Command-line helper: print the code page detected in a raw RTF file.
    # Usage: <this script> RTF_FILE DEFAULT_ENCODING
    import sys

    rtf_path, fallback_encoding = sys.argv[1], sys.argv[2]
    detector = DefaultEncoding(
        rtf_path,
        Exception,
        fallback_encoding,
        check_raw=True,
    )
    print(detector.get_codepage())
|