Allow the encoding check to look directly in the raw RTF file & improve the code that checks for invalid chars

2025-08-30 23:00:21 -04:00 · 2011-01-06 22:25:12 +01:00 · 2011-01-06 22:25:12 +01:00 · bbaecb4007
commit bbaecb4007
parent 36845e1c9d
2 changed files with 59 additions and 29 deletions
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@ -230,14 +230,21 @@ class ParseRtf:
                os.remove(self.__temp_file)
            except OSError:
                pass
-             #Check to see if the file is correctly encoded
+            #Check to see if the file is correctly encoded
            encode_obj = default_encoding.DefaultEncoding(
            in_file = self.__temp_file,
            run_level = self.__run_level,
            bug_handler = RtfInvalidCodeException,
            check_raw = True,
            )
            platform, code_page, default_font_num = encode_obj.find_default_encoding()
            check_encoding_obj = check_encoding.CheckEncoding(
                    bug_handler = RtfInvalidCodeException,
                        )
-            if check_encoding_obj.check_encoding(self.__file, 'cp1252') and \
+            enc = encode_obj.get_codepage()
-                    check_encoding_obj.check_encoding(self.__file, 'cp437') and \
+            if enc != 'mac_roman':
-                        check_encoding_obj.check_encoding(self.__file, 'cp850') and \
+                enc = 'cp' + enc
-                            check_encoding_obj.check_encoding(self.__file, 'mac_roman'):
+            if check_encoding_obj.check_encoding(self.__file, enc):
                file_name = self.__file if isinstance(self.__file, str) \
                                    else self.__file.encode('utf-8')
                msg = _('File %s does not appear to be correctly encoded.\n') % file_name 
--- a/src/calibre/ebooks/rtf2xml/default_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/default_encoding.py
@ -55,18 +55,20 @@ Codepages as to RTF 1.9.1:
    57010	Gujarati
    57011	Punjabi
 '''
 import re
 class DefaultEncoding:
    """
    Find the default encoding for the doc
    """
-    def __init__(self, in_file, bug_handler, run_level = 1,):
+    def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False):
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__platform = 'Windows'
        self.__default_num = 'not-defined'
        self.__code_page = '1252'
        self.__datafetched = False
        self.__fetchraw = check_raw
    def find_default_encoding(self):
        if not self.__datafetched:
@ -92,27 +94,48 @@ class DefaultEncoding:
    def _encoding(self):
        with open(self.__file, 'r') as read_obj:
-            for line in read_obj:
+            if not self.__fetchraw:
-                self.__token_info = line[:16]
+                for line in read_obj:
-                if self.__token_info == 'mi<mk<rtfhed-end':
+                    self.__token_info = line[:16]
-                    break
+                    if self.__token_info == 'mi<mk<rtfhed-end':
-                if self.__token_info == 'cw<ri<ansi-codpg':
+                        break
-                    #cw<ri<ansi-codpg<nu<10000
+                    if self.__token_info == 'cw<ri<ansi-codpg':
-                    self.__code_page = line[20:-1] if line[20:-1] \
+                        #cw<ri<ansi-codpg<nu<10000
-                                        else '1252'
+                        self.__code_page = line[20:-1] if line[20:-1] \
-                if self.__token_info == 'cw<ri<macintosh_':
+                                            else '1252'
-                    self.__platform = 'Macintosh'
+                    if self.__token_info == 'cw<ri<macintosh_':
-                elif self.__token_info == 'cw<ri<pc________':
+                        self.__platform = 'Macintosh'
-                    self.__platform = 'IBMPC'
+                        self.__code_page = 'mac_roman'
-                elif self.__token_info == 'cw<ri<pca_______':
+                    elif self.__token_info == 'cw<ri<pc________':
-                    self.__platform = 'OS/2'
+                        self.__platform = 'IBMPC'
-                if self.__token_info == 'cw<ri<deflt-font':
+                        self.__code_page = '437'
-                    self.__default_num = line[20:-1]
+                    elif self.__token_info == 'cw<ri<pca_______':
-                    #cw<ri<deflt-font<nu<0
+                        self.__platform = 'OS/2'
-        if self.__platform == 'Macintosh':
+                        self.__code_page = '850'
-            self.__code_page = 'mac_roman'
+                    if self.__token_info == 'cw<ri<deflt-font':
-        elif self.__platform == 'IBMPC':
+                        self.__default_num = line[20:-1]
-            self.__code_page = '437'
+                        #cw<ri<deflt-font<nu<0
-        elif self.__platform == 'OS/2':
+            else:
-            self.__code_page = '850'
+                fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
                fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
                for line in read_obj:
                    if fenccp.search(line):
                        self.__code_page = fenccp.search(line).group(1)
                        break
                    if fenc.search(line):
                        enc = fenc.search(line).group(1)
                        if enc == 'mac':
                            self.__code_page = 'mac_roman'
                        elif enc == 'pc':
                            self.__code_page = '437'
                        elif enc == 'pca':
                            self.__code_page = '850'
 # if __name__ == '__main__':
    # from calibre.ebooks.rtf2xml import default_encoding
    # encode_obj = default_encoding.DefaultEncoding(
            # in_file = sys.argv[1],
            # bug_handler = Exception,
            # check_raw = True,
            # )
    # print encode_obj.get_codepage()