diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index fdd17e3f78..05a4847ce5 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -230,14 +230,21 @@ class ParseRtf: os.remove(self.__temp_file) except OSError: pass - #Check to see if the file is correctly encoded + #Check to see if the file is correctly encoded + encode_obj = default_encoding.DefaultEncoding( + in_file = self.__temp_file, + run_level = self.__run_level, + bug_handler = RtfInvalidCodeException, + check_raw = True, + ) + platform, code_page, default_font_num = encode_obj.find_default_encoding() check_encoding_obj = check_encoding.CheckEncoding( bug_handler = RtfInvalidCodeException, ) - if check_encoding_obj.check_encoding(self.__file, 'cp1252') and \ - check_encoding_obj.check_encoding(self.__file, 'cp437') and \ - check_encoding_obj.check_encoding(self.__file, 'cp850') and \ - check_encoding_obj.check_encoding(self.__file, 'mac_roman'): + enc = encode_obj.get_codepage() + if enc != 'mac_roman': + enc = 'cp' + enc + if check_encoding_obj.check_encoding(self.__file, enc): file_name = self.__file if isinstance(self.__file, str) \ else self.__file.encode('utf-8') msg = _('File %s does not appear to be correctly encoded.\n') % file_name diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index a5c2ab9561..a4eeac9663 100755 --- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -55,18 +55,20 @@ Codepages as to RTF 1.9.1: 57010 Gujarati 57011 Punjabi ''' +import re class DefaultEncoding: """ Find the default encoding for the doc """ - def __init__(self, in_file, bug_handler, run_level = 1,): + def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False): self.__file = in_file self.__bug_handler = bug_handler self.__platform = 'Windows' self.__default_num = 'not-defined' self.__code_page = '1252' self.__datafetched = False + self.__fetchraw = check_raw def find_default_encoding(self): if not self.__datafetched: @@ -92,27 +94,48 @@ class DefaultEncoding: def _encoding(self): with open(self.__file, 'r') as read_obj: - for line in read_obj: - self.__token_info = line[:16] - if self.__token_info == 'mi