Allow the encoding check to look directly in the RTF file & improve the code that checks for invalid chars

2025-07-09 03:04:10 -04:00 · 2011-01-06 22:25:12 +01:00 · 2011-01-06 22:25:12 +01:00 · bbaecb4007
commit bbaecb4007
parent 36845e1c9d
2 changed files with 59 additions and 29 deletions
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@ -230,14 +230,21 @@ class ParseRtf:
                os.remove(self.__temp_file)
            except OSError:
                pass
-             #Check to see if the file is correctly encoded
+            #Check to see if the file is correctly encoded
+            encode_obj = default_encoding.DefaultEncoding(
+            in_file = self.__temp_file,
+            run_level = self.__run_level,
+            bug_handler = RtfInvalidCodeException,
+            check_raw = True,
+            )
+            platform, code_page, default_font_num = encode_obj.find_default_encoding()
            check_encoding_obj = check_encoding.CheckEncoding(
                    bug_handler = RtfInvalidCodeException,
                        )
-            if check_encoding_obj.check_encoding(self.__file, 'cp1252') and \
-                    check_encoding_obj.check_encoding(self.__file, 'cp437') and \
-                        check_encoding_obj.check_encoding(self.__file, 'cp850') and \
-                            check_encoding_obj.check_encoding(self.__file, 'mac_roman'):
+            enc = encode_obj.get_codepage()
+            if enc != 'mac_roman':
+                enc = 'cp' + enc
+            if check_encoding_obj.check_encoding(self.__file, enc):
                file_name = self.__file if isinstance(self.__file, str) \
                                    else self.__file.encode('utf-8')
                msg = _('File %s does not appear to be correctly encoded.\n') % file_name 
--- a/src/calibre/ebooks/rtf2xml/default_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/default_encoding.py
@ -55,18 +55,20 @@ Codepages as to RTF 1.9.1:
    57010	Gujarati
    57011	Punjabi
 '''
+import re

 class DefaultEncoding:
    """
    Find the default encoding for the doc
    """
-    def __init__(self, in_file, bug_handler, run_level = 1,):
+    def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False):
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__platform = 'Windows'
        self.__default_num = 'not-defined'
        self.__code_page = '1252'
        self.__datafetched = False
+        self.__fetchraw = check_raw

    def find_default_encoding(self):
        if not self.__datafetched:
@ -92,27 +94,48 @@ class DefaultEncoding:
    
    def _encoding(self):
        with open(self.__file, 'r') as read_obj:
-            for line in read_obj:
-                self.__token_info = line[:16]
-                if self.__token_info == 'mi<mk<rtfhed-end':
-                    break
-                if self.__token_info == 'cw<ri<ansi-codpg':
-                    #cw<ri<ansi-codpg<nu<10000
-                    self.__code_page = line[20:-1] if line[20:-1] \
-                                        else '1252'
-                if self.__token_info == 'cw<ri<macintosh_':
-                    self.__platform = 'Macintosh'
-                elif self.__token_info == 'cw<ri<pc________':
-                    self.__platform = 'IBMPC'
-                elif self.__token_info == 'cw<ri<pca_______':
-                    self.__platform = 'OS/2'
-                if self.__token_info == 'cw<ri<deflt-font':
-                    self.__default_num = line[20:-1]
-                    #cw<ri<deflt-font<nu<0
-        if self.__platform == 'Macintosh':
-            self.__code_page = 'mac_roman'
-        elif self.__platform == 'IBMPC':
-            self.__code_page = '437'
-        elif self.__platform == 'OS/2':
-            self.__code_page = '850'
+            if not self.__fetchraw:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    if self.__token_info == 'mi<mk<rtfhed-end':
+                        break
+                    if self.__token_info == 'cw<ri<ansi-codpg':
+                        #cw<ri<ansi-codpg<nu<10000
+                        self.__code_page = line[20:-1] if line[20:-1] \
+                                            else '1252'
+                    if self.__token_info == 'cw<ri<macintosh_':
+                        self.__platform = 'Macintosh'
+                        self.__code_page = 'mac_roman'
+                    elif self.__token_info == 'cw<ri<pc________':
+                        self.__platform = 'IBMPC'
+                        self.__code_page = '437'
+                    elif self.__token_info == 'cw<ri<pca_______':
+                        self.__platform = 'OS/2'
+                        self.__code_page = '850'
+                    if self.__token_info == 'cw<ri<deflt-font':
+                        self.__default_num = line[20:-1]
+                        #cw<ri<deflt-font<nu<0
+            else:
+                fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
+                fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
+                for line in read_obj:
+                    if fenccp.search(line):
+                        self.__code_page = fenccp.search(line).group(1)
+                        break
+                    if fenc.search(line):
+                        enc = fenc.search(line).group(1)
+                        if enc == 'mac':
+                            self.__code_page = 'mac_roman'
+                        elif enc == 'pc':
+                            self.__code_page = '437'
+                        elif enc == 'pca':
+                            self.__code_page = '850'

+# if __name__ == '__main__':
+    # from calibre.ebooks.rtf2xml import default_encoding
+    # encode_obj = default_encoding.DefaultEncoding(
+            # in_file = sys.argv[1],
+            # bug_handler = Exception,
+            # check_raw = True,
+            # )
+    # print encode_obj.get_codepage()