Allow check encoding to look directly in rtf & improve code checking for invalid chars

This commit is contained in:
Sengian 2011-01-06 22:25:12 +01:00
parent 36845e1c9d
commit bbaecb4007
2 changed files with 59 additions and 29 deletions

View File

@ -230,14 +230,21 @@ class ParseRtf:
os.remove(self.__temp_file)
except OSError:
pass
#Check to see if the file is correctly encoded
#Check to see if the file is correctly encoded
encode_obj = default_encoding.DefaultEncoding(
in_file = self.__temp_file,
run_level = self.__run_level,
bug_handler = RtfInvalidCodeException,
check_raw = True,
)
platform, code_page, default_font_num = encode_obj.find_default_encoding()
check_encoding_obj = check_encoding.CheckEncoding(
bug_handler = RtfInvalidCodeException,
)
if check_encoding_obj.check_encoding(self.__file, 'cp1252') and \
check_encoding_obj.check_encoding(self.__file, 'cp437') and \
check_encoding_obj.check_encoding(self.__file, 'cp850') and \
check_encoding_obj.check_encoding(self.__file, 'mac_roman'):
enc = encode_obj.get_codepage()
if enc != 'mac_roman':
enc = 'cp' + enc
if check_encoding_obj.check_encoding(self.__file, enc):
file_name = self.__file if isinstance(self.__file, str) \
else self.__file.encode('utf-8')
msg = _('File %s does not appear to be correctly encoded.\n') % file_name

View File

@ -55,18 +55,20 @@ Codepages as to RTF 1.9.1:
57010 Gujarati
57011 Punjabi
'''
import re
class DefaultEncoding:
"""
Find the default encoding for the doc
"""
def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False):
    """
    Set up the encoding finder with Windows/cp1252 defaults.

    in_file -- path of the file to inspect
    bug_handler -- exception class kept for error reporting
    run_level -- accepted for interface compatibility; it is not stored
        by this method
    check_raw -- when True the file is scanned as raw RTF rather than as
        the tokenized intermediate format
    """
    self.__file = in_file
    self.__bug_handler = bug_handler
    # Defaults used when the document declares nothing.
    self.__platform = 'Windows'
    self.__default_num = 'not-defined'
    self.__code_page = '1252'
    # Flipped elsewhere once the file has been scanned.
    self.__datafetched = False
    self.__fetchraw = check_raw
def find_default_encoding(self):
if not self.__datafetched:
@ -92,27 +94,48 @@ class DefaultEncoding:
def _encoding(self):
    """
    Scan the input file and record the code page it declares.

    Tokenized mode (check_raw false): reads token lines up to the
    'mi<mk<rtfhed-end' marker. A platform token sets both the platform
    and that platform's default code page; an 'ansi-codpg' token sets the
    code page explicitly (falling back to '1252' when its value is
    empty); a 'deflt-font' token sets the default font number.

    Raw mode (check_raw true): searches each line of the RTF text for
    \\ansicpgN, which wins and stops the scan, or for one of
    \\mac, \\pc, \\pca, \\ansi, which selects the platform default code
    page (\\ansi leaves the current value untouched).
    """
    # token -> (platform name, default code page for that platform)
    platform_tokens = {
        'cw<ri<macintosh_': ('Macintosh', 'mac_roman'),
        'cw<ri<pc________': ('IBMPC', '437'),
        'cw<ri<pca_______': ('OS/2', '850'),
    }
    # raw RTF charset keyword -> default code page
    raw_keywords = {
        'mac': 'mac_roman',
        'pc': '437',
        'pca': '850',
    }
    with open(self.__file, 'r') as read_obj:
        if not self.__fetchraw:
            for line in read_obj:
                self.__token_info = line[:16]
                if self.__token_info == 'mi<mk<rtfhed-end':
                    break
                # The value follows the 16-char token and '<nu<'; strip
                # only the newline so a final unterminated line is not
                # truncated.
                value = line[20:].rstrip('\n')
                if self.__token_info == 'cw<ri<ansi-codpg':
                    #cw<ri<ansi-codpg<nu<10000
                    self.__code_page = value or '1252'
                elif self.__token_info in platform_tokens:
                    self.__platform, self.__code_page = \
                        platform_tokens[self.__token_info]
                elif self.__token_info == 'cw<ri<deflt-font':
                    #cw<ri<deflt-font<nu<0
                    self.__default_num = value
        else:
            fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
            fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
            for line in read_obj:
                codepage_match = fenccp.search(line)
                if codepage_match:
                    # An explicit code page is authoritative: stop here.
                    self.__code_page = codepage_match.group(1)
                    break
                charset_match = fenc.search(line)
                if charset_match:
                    keyword = charset_match.group(1)
                    if keyword in raw_keywords:
                        self.__code_page = raw_keywords[keyword]
# if __name__ == '__main__':
# from calibre.ebooks.rtf2xml import default_encoding
# encode_obj = default_encoding.DefaultEncoding(
# in_file = sys.argv[1],
# bug_handler = Exception,
# check_raw = True,
# )
# print encode_obj.get_codepage()