Allow check encoding to look directly in rtf & improve code checking for invalid chars

This commit is contained in:
Sengian 2011-01-06 22:25:12 +01:00
parent 36845e1c9d
commit bbaecb4007
2 changed files with 59 additions and 29 deletions

View File

@ -230,14 +230,21 @@ class ParseRtf:
os.remove(self.__temp_file) os.remove(self.__temp_file)
except OSError: except OSError:
pass pass
#Check to see if the file is correctly encoded #Check to see if the file is correctly encoded
encode_obj = default_encoding.DefaultEncoding(
in_file = self.__temp_file,
run_level = self.__run_level,
bug_handler = RtfInvalidCodeException,
check_raw = True,
)
platform, code_page, default_font_num = encode_obj.find_default_encoding()
check_encoding_obj = check_encoding.CheckEncoding( check_encoding_obj = check_encoding.CheckEncoding(
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
) )
if check_encoding_obj.check_encoding(self.__file, 'cp1252') and \ enc = encode_obj.get_codepage()
check_encoding_obj.check_encoding(self.__file, 'cp437') and \ if enc != 'mac_roman':
check_encoding_obj.check_encoding(self.__file, 'cp850') and \ enc = 'cp' + enc
check_encoding_obj.check_encoding(self.__file, 'mac_roman'): if check_encoding_obj.check_encoding(self.__file, enc):
file_name = self.__file if isinstance(self.__file, str) \ file_name = self.__file if isinstance(self.__file, str) \
else self.__file.encode('utf-8') else self.__file.encode('utf-8')
msg = _('File %s does not appear to be correctly encoded.\n') % file_name msg = _('File %s does not appear to be correctly encoded.\n') % file_name

View File

@ -55,18 +55,20 @@ Codepages as to RTF 1.9.1:
57010 Gujarati 57010 Gujarati
57011 Punjabi 57011 Punjabi
''' '''
import re
class DefaultEncoding: class DefaultEncoding:
""" """
Find the default encoding for the doc Find the default encoding for the doc
""" """
def __init__(self, in_file, bug_handler, run_level = 1,): def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False):
self.__file = in_file self.__file = in_file
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
self.__platform = 'Windows' self.__platform = 'Windows'
self.__default_num = 'not-defined' self.__default_num = 'not-defined'
self.__code_page = '1252' self.__code_page = '1252'
self.__datafetched = False self.__datafetched = False
self.__fetchraw = check_raw
def find_default_encoding(self): def find_default_encoding(self):
if not self.__datafetched: if not self.__datafetched:
@ -92,27 +94,48 @@ class DefaultEncoding:
def _encoding(self): def _encoding(self):
with open(self.__file, 'r') as read_obj: with open(self.__file, 'r') as read_obj:
for line in read_obj: if not self.__fetchraw:
self.__token_info = line[:16] for line in read_obj:
if self.__token_info == 'mi<mk<rtfhed-end': self.__token_info = line[:16]
break if self.__token_info == 'mi<mk<rtfhed-end':
if self.__token_info == 'cw<ri<ansi-codpg': break
#cw<ri<ansi-codpg<nu<10000 if self.__token_info == 'cw<ri<ansi-codpg':
self.__code_page = line[20:-1] if line[20:-1] \ #cw<ri<ansi-codpg<nu<10000
else '1252' self.__code_page = line[20:-1] if line[20:-1] \
if self.__token_info == 'cw<ri<macintosh_': else '1252'
self.__platform = 'Macintosh' if self.__token_info == 'cw<ri<macintosh_':
elif self.__token_info == 'cw<ri<pc________': self.__platform = 'Macintosh'
self.__platform = 'IBMPC' self.__code_page = 'mac_roman'
elif self.__token_info == 'cw<ri<pca_______': elif self.__token_info == 'cw<ri<pc________':
self.__platform = 'OS/2' self.__platform = 'IBMPC'
if self.__token_info == 'cw<ri<deflt-font': self.__code_page = '437'
self.__default_num = line[20:-1] elif self.__token_info == 'cw<ri<pca_______':
#cw<ri<deflt-font<nu<0 self.__platform = 'OS/2'
if self.__platform == 'Macintosh': self.__code_page = '850'
self.__code_page = 'mac_roman' if self.__token_info == 'cw<ri<deflt-font':
elif self.__platform == 'IBMPC': self.__default_num = line[20:-1]
self.__code_page = '437' #cw<ri<deflt-font<nu<0
elif self.__platform == 'OS/2': else:
self.__code_page = '850' fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
for line in read_obj:
if fenccp.search(line):
self.__code_page = fenccp.search(line).group(1)
break
if fenc.search(line):
enc = fenc.search(line).group(1)
if enc == 'mac':
self.__code_page = 'mac_roman'
elif enc == 'pc':
self.__code_page = '437'
elif enc == 'pca':
self.__code_page = '850'
# if __name__ == '__main__':
# from calibre.ebooks.rtf2xml import default_encoding
# encode_obj = default_encoding.DefaultEncoding(
# in_file = sys.argv[1],
# bug_handler = Exception,
# check_raw = True,
# )
# print encode_obj.get_codepage()