Clean default encoding

This commit is contained in:
Sengian 2011-01-05 23:47:14 +01:00
parent 5784256e02
commit bb50018eb3
2 changed files with 109 additions and 50 deletions

View File

@ -1,61 +1,118 @@
######################################################################### #########################################################################
# # # #
# #
# copyright 2002 Paul Henry Tremblay # # copyright 2002 Paul Henry Tremblay #
# # # #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the Free Software #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
# 02111-1307 USA #
# #
# #
######################################################################### #########################################################################
'''
Code pages as of RTF 1.9.1:
437 United States IBM
708 Arabic (ASMO 708)
709 Arabic (ASMO 449+, BCON V4)
710 Arabic (transparent Arabic)
711 Arabic (Nafitha Enhanced)
720 Arabic (transparent ASMO)
819 Windows 3.1 (United States and Western Europe)
850 IBM multilingual
852 Eastern European
860 Portuguese
862 Hebrew
863 French Canadian
864 Arabic
865 Norwegian
866 Soviet Union
874 Thai
932 Japanese
936 Simplified Chinese
949 Korean
950 Traditional Chinese
1250 Eastern European
1251 Cyrillic
1252 Western European
1253 Greek
1254 Turkish
1255 Hebrew
1256 Arabic
1257 Baltic
1258 Vietnamese
1361 Johab
10000 MAC Roman
10001 MAC Japan
10004 MAC Arabic
10005 MAC Hebrew
10006 MAC Greek
10007 MAC Cyrillic
10029 MAC Latin2
10081 MAC Turkish
57002 Devanagari
57003 Bengali
57004 Tamil
57005 Telugu
57006 Assamese
57007 Oriya
57008 Kannada
57009 Malayalam
57010 Gujarati
57011 Punjabi
'''
class DefaultEncoding:
    """
    Find the default encoding for the doc.

    The tokenized RTF file is scanned lazily (at most once per instance)
    up to the end-of-header marker.  Three pieces of information are
    collected: the platform keyword, the ANSI code page number and the
    default font number.
    """

    # Token prefix (first 16 characters of a line) -> platform name.
    _PLATFORM_TOKENS = {
        'cw<ri<macintosh_': 'Macintosh',
        'cw<ri<pc________': 'IBMPC',
        'cw<ri<pca_______': 'OS/2',
    }

    # Code page forced by a platform once the header has been scanned.
    # A platform listed here overrides any code page read from an
    # 'ansi-codpg' token; 'Windows' is absent so it keeps what was read.
    _PLATFORM_CODE_PAGES = {
        'Macintosh': 'mac_roman',
        'IBMPC': '437',
        'OS/2': '850',
    }

    def __init__(self, in_file, bug_handler, run_level = 1,):
        """
        Required:
            'in_file' -- path of the tokenized file to scan
            'bug_handler' -- stored on the instance, not used in this class
        Optional:
            'run_level' -- accepted for interface compatibility, unused here
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        # Defaults used when the header does not say otherwise.
        self.__platform = 'Windows'
        self.__default_num = 'not-defined'
        self.__code_page = '1252'
        self.__datafetched = False

    def _fetch_once(self):
        """
        Scan the file the first time any accessor is called.
        """
        if not self.__datafetched:
            self._encoding()
            self.__datafetched = True

    def find_default_encoding(self):
        """
        Returns:
            a tuple (platform, code_page, default_font_number).
            For the Macintosh platform the code page is returned as is
            ('mac_roman'); for every other platform it is prefixed with
            'ansicpg'.
        """
        self._fetch_once()
        if self.__platform == 'Macintosh':
            code_page = self.__code_page
        else:
            code_page = 'ansicpg' + self.__code_page
        return self.__platform, code_page, self.__default_num

    def get_codepage(self):
        """
        Returns:
            the code page without the 'ansicpg' prefix.
        """
        self._fetch_once()
        return self.__code_page

    def get_platform(self):
        """
        Returns:
            the platform name ('Windows', 'Macintosh', 'IBMPC' or 'OS/2').
        """
        self._fetch_once()
        return self.__platform

    def _encoding(self):
        """
        Read the header tokens and update platform, code page and
        default font number.  Scanning stops at 'mi<mk<rtfhed-end'.
        """
        with open(self.__file, 'r') as read_obj:
            for line in read_obj:
                token_info = line[:16]
                if token_info == 'mi<mk<rtfhed-end':
                    break
                if token_info == 'cw<ri<ansi-codpg':
                    # e.g. cw<ri<ansi-codpg<nu<10000
                    # The value starts at column 20; [:-1] drops the newline.
                    self.__code_page = line[20:-1] or '1252'
                elif token_info == 'cw<ri<deflt-font':
                    # e.g. cw<ri<deflt-font<nu<0
                    self.__default_num = line[20:-1]
                elif token_info in self._PLATFORM_TOKENS:
                    self.__platform = self._PLATFORM_TOKENS[token_info]
        # Applied after the scan so the result does not depend on the
        # order in which the tokens appear in the header.
        if self.__platform in self._PLATFORM_CODE_PAGES:
            self.__code_page = self._PLATFORM_CODE_PAGES[self.__platform]

View File

@ -163,15 +163,17 @@ class ProcessTokens:
'rtf' : ('ri', 'rtf_______', self.default_func), 'rtf' : ('ri', 'rtf_______', self.default_func),
'deff' : ('ri', 'deflt-font', self.default_func), 'deff' : ('ri', 'deflt-font', self.default_func),
'mac' : ('ri', 'macintosh_', self.default_func), 'mac' : ('ri', 'macintosh_', self.default_func),
'pc' : ('ri', 'pc________', self.default_func),
'pca' : ('ri', 'pca_______', self.default_func),
'ansi' : ('ri', 'ansi______', self.default_func), 'ansi' : ('ri', 'ansi______', self.default_func),
'ansicpg' : ('ri', 'ansi-codpg', self.default_func), 'ansicpg' : ('ri', 'ansi-codpg', self.default_func),
# notes => nt # notes => nt
'footnote' : ('nt', 'footnote__', self.default_func), 'footnote' : ('nt', 'footnote__', self.default_func),
'ftnalt' : ('nt', 'type______<endnote', self.two_part_func), 'ftnalt' : ('nt', 'type______<endnote', self.two_part_func),
# anchor => an # anchor => an
'tc' : ('an', 'toc_______', self.default_func), 'tc' : ('an', 'toc_______', self.default_func),
'bkmkstt' : ('an', 'book-mk-st', self.default_func), 'bkmkstt' : ('an', 'book-mk-st', self.default_func),
'bkmkstart' : ('an', 'book-mk-st', self.default_func), 'bkmkstart' : ('an', 'book-mk-st', self.default_func),
'bkmkend' : ('an', 'book-mk-en', self.default_func), 'bkmkend' : ('an', 'book-mk-en', self.default_func),
'xe' : ('an', 'index-mark', self.default_func), 'xe' : ('an', 'index-mark', self.default_func),
'rxe' : ('an', 'place_____', self.default_func), 'rxe' : ('an', 'place_____', self.default_func),