Handle non-ASCII charsets in RTF when they are declared as a codepage

2025-07-09 03:04:10 -04:00 · 2011-01-07 08:07:39 +01:00 · 2011-01-07 08:07:39 +01:00 · ac07ff853e
commit ac07ff853e
parent b2187360ec
4 changed files with 41 additions and 15 deletions
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@ -326,7 +326,6 @@ class ParseRtf:
                invalid_rtf_handler = InvalidRtfException,
                )
        hex2utf_obj.convert_hex_2_utf8()
-        # raise RtfInvalidCodeException, 'stop'
        self.__bracket_match('hex_2_utf_preamble')
        fonts_obj = fonts.Fonts(
            in_file = self.__temp_file,
@ -523,6 +522,7 @@ class ParseRtf:
                indent = self.__indent,
                run_level = self.__run_level,
                no_dtd = self.__no_dtd,
+                encoding = encode_obj.get_codepage(),
                bug_handler = RtfInvalidCodeException,
                )
        tags_obj.convert_to_tags()
--- a/src/calibre/ebooks/rtf2xml/check_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/check_encoding.py
@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import sys
+
 class CheckEncoding:

    def __init__(self, bug_handler):
--- a/src/calibre/ebooks/rtf2xml/convert_to_tags.py
+++ b/src/calibre/ebooks/rtf2xml/convert_to_tags.py
@ -1,6 +1,9 @@
 import os, tempfile
-from calibre.ebooks.rtf2xml import copy
+
+from calibre.ebooks.rtf2xml import copy, check_encoding
+
 public_dtd = 'rtf2xml1.0.dtd'
+
 class ConvertToTags:
    """
    Convert file to XML
@ -10,6 +13,7 @@ class ConvertToTags:
            bug_handler,
            dtd_path,
            no_dtd,
+            encoding,
            indent = None,
            copy = None,
            run_level = 1,
@ -29,9 +33,14 @@ class ConvertToTags:
        self.__copy = copy
        self.__dtd_path = dtd_path
        self.__no_dtd = no_dtd
+        if encoding != 'mac_roman':
+            self.__encoding = 'cp' + encoding
+        else:
+            self.__encoding = 'mac_roman'
        self.__indent = indent
        self.__run_level = run_level
        self.__write_to = tempfile.mktemp()
+
    def __initiate_values(self):
        """
        Set values, including those for the dictionary.
@ -61,6 +70,7 @@ class ConvertToTags:
        'tx<ut<__________'  :   self.__text_func,
        'mi<tg<empty_____'  :   self.__empty_func,
        }
+
    def __open_func(self, line):
        """
        Print the opening tag and newlines when needed.
@ -73,6 +83,7 @@ class ConvertToTags:
        if info in self.__two_new_line:
            self.__write_extra_new_line()
        self.__write_obj.write('<%s>' % info)
+
    def __empty_func(self, line):
        """
        Print out empty tag and newlines when needed.
@ -85,6 +96,7 @@ class ConvertToTags:
            self.__write_new_line()
        if info in self.__two_new_line:
            self.__write_extra_new_line()
+
    def __open_att_func(self, line):
        """
        Process lines for open tags that have attributes.
@ -119,6 +131,7 @@ class ConvertToTags:
            self.__write_new_line()
        if element_name in self.__two_new_line:
            self.__write_extra_new_line()
+
    def __empty_att_func(self, line):
        """
        Same as the __open_att_func, except a '/' is placed at the end of the tag.
@ -143,6 +156,7 @@ class ConvertToTags:
            self.__write_new_line()
        if element_name in self.__two_new_line:
            self.__write_extra_new_line()
+
    def __close_func(self, line):
        """
        Print out the closed tag and new lines, if appropriate.
@ -156,6 +170,7 @@ class ConvertToTags:
            self.__write_new_line()
        if info in self.__two_new_line:
            self.__write_extra_new_line()
+
    def __text_func(self, line):
        """
        Simply print out the information between [17:-1]
@ -163,6 +178,7 @@ class ConvertToTags:
        #tx<nu<__________<Normal;
        # change this!
        self.__write_obj.write(line[17:-1])
+
    def __write_extra_new_line(self):
        """
        Print out extra new lines if the new lines have not exceeded two. If
@ -172,8 +188,10 @@ class ConvertToTags:
            return
        if self.__new_line < 2:
            self.__write_obj.write('\n')
+
    def __default_func(self, line):
        pass
+
    def __write_new_line(self):
        """
        Print out a new line if a new line has not already been printed out.
@ -183,11 +201,22 @@ class ConvertToTags:
        if not self.__new_line:
            self.__write_obj.write('\n')
            self.__new_line += 1
+
    def __write_dec(self):
        """
        Write the XML declaration at the top of the document.
        """
-        self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
+        #keep maximum compatibility with previous version
+        check_encoding_obj = check_encoding.CheckEncoding(
+                    bug_handler = self.__bug_handler,
+                        )
+        if not check_encoding_obj.check_encoding(self.__file):
+            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
+        elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
+            self.__write_obj.write('<?xml version="1.0" encoding="%s" ?>' % self.__encoding)
+        else:
+            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
+            sys.stderr.write(_('Bad RTF encoding, revert to US-ASCII chars and hope for the best'))
        self.__new_line = 0
        self.__write_new_line()
        if self.__no_dtd:
@ -207,6 +236,7 @@ class ConvertToTags:
            )
        self.__new_line = 0
        self.__write_new_line()
+
    def convert_to_tags(self):
        """
        Read in the file one line at a time. Get the important info, between
@ -222,18 +252,14 @@ class ConvertToTags:
            an empty tag function.
            """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
        self.__write_obj = open(self.__write_to, 'w')
        self.__write_dec()
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            action = self.__state_dict.get(self.__token_info)
-            if action != None:
-                action(line)
-        read_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            for line in read_obj:
+                self.__token_info = line[:16]
+                action = self.__state_dict.get(self.__token_info)
+                if action is not None:
+                    action(line)
        self.__write_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
--- a/src/calibre/ebooks/rtf2xml/default_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/default_encoding.py
@ -132,8 +132,7 @@ class DefaultEncoding:
                            self.__code_page = '850'

 # if __name__ == '__main__':
-    # from calibre.ebooks.rtf2xml import default_encoding
-    # encode_obj = default_encoding.DefaultEncoding(
+    # encode_obj = DefaultEncoding(
            # in_file = sys.argv[1],
            # bug_handler = Exception,
            # check_raw = True,