Global overhaul of rtf2xml: RTFfixes (4) -> minor corrections in line endings and check brackets; move the encoding check first to eliminate non-ASCII RTF

2025-07-09 03:04:10 -04:00 · 2010-08-12 17:16:37 +02:00 · 2010-08-12 17:16:37 +02:00 · b9ed0c6b3d
commit b9ed0c6b3d
parent 7c70914ad3
4 changed files with 33 additions and 29 deletions
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@ -133,7 +133,6 @@ class ParseRtf:
        self.__temp_dir = out_dir
        self.__dtd_path = dtd
        self.__check_file(in_file,"file_to_parse")
        self.__check_ascii(in_file)
        self.__char_data = char_data
        self.__debug_dir = deb_dir
        self.__check_dir(self.__temp_dir)
@ -152,6 +151,7 @@ class ParseRtf:
        self.__group_borders = group_borders
        self.__empty_paragraphs = empty_paragraphs
        self.__no_dtd = no_dtd
    def __check_file(self, the_file, type):
        """Check to see if files exist"""
        if hasattr(the_file, 'read'): return
@ -164,6 +164,7 @@ class ParseRtf:
        else:
            msg = "\nThe file '%s' cannot be found" % the_file
            raise RtfInvalidCodeException, msg
    def __check_dir(self, the_dir):
        """Check to see if directory exists"""
        if not the_dir :
@ -173,15 +174,7 @@ class ParseRtf:
            msg = "\n%s is not a directory" % the_dir
            raise RtfInvalidCodeException, msg
        return 1
-    def __check_ascii(self, the_file):
+
        """Check to see if the file is correct ascii"""
        try:
            test = codecs.open(the_file, 'r', 'ascii', 'strict')
            test.close()
        except UnicodeError:
            msg = "\n%s is not a correct ascii file" % the_file
            raise RtfInvalidCodeException, msg
        return 1
    def parse_rtf(self):
        """
        Parse the file by calling on other classes.
@ -192,6 +185,18 @@ class ParseRtf:
            depending on the value of 'output' when the instance was created.
        """
        self.__temp_file = self.__make_temp_file(self.__file)
        #Check to see if the file is correct ascii first
        check_encoding_obj = check_encoding.CheckEncoding(
                bug_handler = RtfInvalidCodeException,
                    )
        if check_encoding_obj.check_encoding(self.__file):
            try:
                os.remove(self.__temp_file)
            except OSError:
                pass
            sys.stderr.write('File "%s" does not appear to be ascii.\n' \
                 % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
            raise InvalidRtfException
        # if the self.__deb_dir is true, then create a copy object,
        # set the directory to write to, remove files, and copy
        # the new temporary file to this directory
@ -214,7 +219,7 @@ class ParseRtf:
                in_file = self.__temp_file,
                bug_handler = RtfInvalidCodeException,
                copy = self.__copy,
-                #run_level = self.__run_level,
+                run_level = self.__run_level,
                replace_illegals = self.__replace_illegals,
                )
        line_obj.fix_endings()
@ -223,8 +228,8 @@ class ParseRtf:
        tokenize_obj = tokenize.Tokenize(
                bug_handler = RtfInvalidCodeException,
                in_file = self.__temp_file,
-                copy = self.__copy,)
+                copy = self.__copy,
-                #run_level = self.__run_level,)
+                run_level = self.__run_level)
        tokenize_obj.tokenize()
        process_tokens_obj = process_tokens.ProcessTokens(
            in_file = self.__temp_file,
@ -240,10 +245,6 @@ class ParseRtf:
                os.remove(self.__temp_file)
            except OSError:
                pass
            check_encoding_obj = check_encoding.CheckEncoding(
                bug_handler = RtfInvalidCodeException,
                    )
            check_encoding_obj.check_encoding(self.__file)
            sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
            raise InvalidRtfException, msg
        delete_info_obj = delete_info.DeleteInfo(
@ -548,8 +549,7 @@ class ParseRtf:
        """Make a temporary file to parse"""
        write_file="rtf_write_file"
        read_obj = file if hasattr(file, 'read') else open(file,'r')
-        write_obj = open(write_file, 'w')
+        write_obj = open(write_file, 'wb')
-        for line in read_obj:
+        write_obj.write(read_obj.read())
            write_obj.write(line)
        write_obj.close()
        return write_file
--- a/src/calibre/ebooks/rtf2xml/check_brackets.py
+++ b/src/calibre/ebooks/rtf2xml/check_brackets.py
@ -30,7 +30,6 @@ class CheckBrackets:
        self.__bracket_count += 1
    def close_brack(self, line):
        num = line[-5:-1]
        ##self.__open_bracket_num.append(num)
        try:
            last_num = self.__open_bracket_num.pop()
        except:
--- a/src/calibre/ebooks/rtf2xml/check_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/check_encoding.py
@ -14,12 +14,11 @@ class CheckEncoding:
                sys.stderr.write(str(msg) + '\n')
    def check_encoding(self, path, encoding='us-ascii'):
        read_obj = open(path, 'r')
-        line_to_read = 1
+        input_file = read_obj.read()
        read_obj.close()
        line_num = 0
-        while line_to_read:
+        for line in input_file:
            line_num += 1
            line_to_read = read_obj.readline()
            line = line_to_read
            try:
                line.decode(encoding)
            except UnicodeError:
@ -27,6 +26,9 @@ class CheckEncoding:
                    self.__get_position_error(line, encoding, line_num)
                else:
                    sys.stderr.write('line: %d has bad encoding\n'%line_num)
                return True
        return False
 if __name__ == '__main__':
    check_encoding_obj = CheckEncoding()
    check_encoding_obj.check_encoding(sys.argv[1])
--- a/src/calibre/ebooks/rtf2xml/line_endings.py
+++ b/src/calibre/ebooks/rtf2xml/line_endings.py
@ -23,7 +23,7 @@ class FixLineEndings:
            bug_handler,
            in_file = None,
            copy = None,
-            #run_level = 1, calibre why keep it?
+            run_level = 1,
            replace_illegals = 1,
            ):
        self.__file = in_file
@ -32,8 +32,11 @@ class FixLineEndings:
        self.__write_to = tempfile.mktemp()
        self.__replace_illegals = replace_illegals
    def fix_endings(self):
-        illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
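+        #remove invalid ASCII control chars: 0 to 7, 11, and 14 to 24 (NOTE: range(8) stops at 7, so 0x08 is no longer removed, unlike the old regex -- confirm intended)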
        #always check since I have to get rid of illegal characters
        chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
        illegal_regx = re.compile(u'|'.join(map(unichr, chars)))
        #illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
        #read
        read_obj = open(self.__file, 'r')
        input_file = read_obj.read()
@ -42,7 +45,7 @@ class FixLineEndings:
        input_file = input_file.replace ('\r\n', '\n')
        input_file = input_file.replace ('\r', '\n')
        if self.__replace_illegals:
-            input_file = re.sub(illegal_regx, '', input_file)
+            input_file = illegal_regx.sub('', input_file)
        #write
        write_obj = open(self.__write_to, 'wb')
        write_obj.write(input_file)