Modify mac-roman encoding handling: Macintosh now maps to code page 10000

This commit is contained in:
Sengian 2011-01-15 16:11:28 +01:00
parent 10c2e603e2
commit 93ef1699df
3 changed files with 33 additions and 29 deletions

View File

@ -237,9 +237,7 @@ class ParseRtf:
check_encoding_obj = check_encoding.CheckEncoding( check_encoding_obj = check_encoding.CheckEncoding(
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
) )
enc = encode_obj.get_codepage() enc = 'cp' + encode_obj.get_codepage()
if enc != 'mac_roman':
enc = 'cp' + enc
msg = 'Exception in token processing' msg = 'Exception in token processing'
if check_encoding_obj.check_encoding(self.__file, enc): if check_encoding_obj.check_encoding(self.__file, enc):
file_name = self.__file if isinstance(self.__file, str) \ file_name = self.__file if isinstance(self.__file, str) \

View File

@ -74,9 +74,6 @@ class DefaultEncoding:
if not self.__datafetched: if not self.__datafetched:
self._encoding() self._encoding()
self.__datafetched = True self.__datafetched = True
if self.__platform == 'Macintosh':
code_page = self.__code_page
else:
code_page = 'ansicpg' + self.__code_page code_page = 'ansicpg' + self.__code_page
return self.__platform, code_page, self.__default_num return self.__platform, code_page, self.__default_num
@ -94,49 +91,59 @@ class DefaultEncoding:
def _encoding(self): def _encoding(self):
with open(self.__file, 'r') as read_obj: with open(self.__file, 'r') as read_obj:
cpfound = False
if not self.__fetchraw: if not self.__fetchraw:
for line in read_obj: for line in read_obj:
self.__token_info = line[:16] self.__token_info = line[:16]
if self.__token_info == 'mi<mk<rtfhed-end': if self.__token_info == 'mi<mk<rtfhed-end':
break break
if self.__token_info == 'cw<ri<ansi-codpg':
#cw<ri<ansi-codpg<nu<10000
self.__code_page = line[20:-1] if int(line[20:-1]) \
else '1252'
if self.__token_info == 'cw<ri<macintosh_': if self.__token_info == 'cw<ri<macintosh_':
self.__platform = 'Macintosh' self.__platform = 'Macintosh'
self.__code_page = 'mac_roman'
elif self.__token_info == 'cw<ri<pc________': elif self.__token_info == 'cw<ri<pc________':
self.__platform = 'IBMPC' self.__platform = 'IBMPC'
self.__code_page = '437'
elif self.__token_info == 'cw<ri<pca_______': elif self.__token_info == 'cw<ri<pca_______':
self.__platform = 'OS/2' self.__platform = 'OS/2'
self.__code_page = '850' if self.__token_info == 'cw<ri<ansi-codpg' \
and int(line[20:-1]):
self.__code_page = line[20:-1]
if self.__token_info == 'cw<ri<deflt-font': if self.__token_info == 'cw<ri<deflt-font':
self.__default_num = line[20:-1] self.__default_num = line[20:-1]
cpfound = True
#cw<ri<deflt-font<nu<0 #cw<ri<deflt-font<nu<0
if self.__platform != 'Windows' and \
not cpfound:
if self.__platform == 'Macintosh':
self.__code_page = '10000'
elif self.__platform == 'IBMPC':
self.__code_page = '437'
elif self.__platform == 'OS/2':
self.__code_page = '850'
else: else:
fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+') fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+') fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
for line in read_obj: for line in read_obj:
if fenc.search(line):
enc = fenc.search(line).group(1)
if fenccp.search(line): if fenccp.search(line):
cp = fenccp.search(line).group(1) cp = fenccp.search(line).group(1)
if not int(cp): if not int(cp):
self.__code_page = cp self.__code_page = cp
cpfound = True
break break
if fenc.search(line): if self.__platform != 'Windows' and \
enc = fenc.search(line).group(1) not cpfound:
if enc == 'mac': if enc == 'mac':
self.__code_page = 'mac_roman' self.__code_page = '10000'
elif enc == 'pc': elif enc == 'pc':
self.__code_page = '437' self.__code_page = '437'
elif enc == 'pca': elif enc == 'pca':
self.__code_page = '850' self.__code_page = '850'
# if __name__ == '__main__': if __name__ == '__main__':
# encode_obj = DefaultEncoding( encode_obj = DefaultEncoding(
# in_file = sys.argv[1], in_file = sys.argv[1],
# bug_handler = Exception, bug_handler = Exception,
# check_raw = True, check_raw = True,
# ) )
# print encode_obj.get_codepage() print encode_obj.get_codepage()

View File

@ -78,7 +78,6 @@ class ProcessTokens:
'backslash' : ('nu', '\\', self.text_func), 'backslash' : ('nu', '\\', self.text_func),
'ob' : ('nu', '{', self.text_func), 'ob' : ('nu', '{', self.text_func),
'cb' : ('nu', '}', self.text_func), 'cb' : ('nu', '}', self.text_func),
#'line' : ('nu', ' ', self.text_func), calibre
# paragraph formatting => pf # paragraph formatting => pf
'page' : ('pf', 'page-break', self.default_func), 'page' : ('pf', 'page-break', self.default_func),
'par' : ('pf', 'par-end___', self.default_func), 'par' : ('pf', 'par-end___', self.default_func),