Global overhaul of rtf2xml: RTF fixes (5) -> minor corrections and regression fix

2025-07-31 14:33:54 -04:00 · 2010-08-16 10:08:59 +02:00 · 2010-08-16 10:08:59 +02:00 · a9fd0ad4ba
commit a9fd0ad4ba
parent b9ed0c6b3d
5 changed files with 104 additions and 100 deletions
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@ -50,7 +50,7 @@ class RTFInput(InputFormatPlugin):
        parser = ParseRtf(
            in_file    = stream,
            out_file   = ofile,
-			deb_dir = 'I:\\Calibre\\rtfdebug',
+			deb_dir = 'D:\\calibre\\pierre\\debug\\rtfdebug',
            # Convert symbol fonts to unicode equivalents. Default
            # is 1
            convert_symbol = 1,
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@ -120,8 +120,6 @@ class ParseRtf:
            script tries to output to directory where the script is executed.)
            'deb_dir' --debug directory. If a debug_dir is provided, the script
            will copy each run through as a file to examine in the debug_dir
            'perl_script'--use perl to make tokens. This runs just a bit faster.
            (I will probably phase this out.)
            'check_brackets' -- make sure the brackets match up after each run
            through a file. Only for debugging.
        Returns: Nothing
@ -142,7 +140,7 @@ class ParseRtf:
        self.__convert_wingdings = convert_wingdings
        self.__convert_zapf = convert_zapf
        self.__run_level = run_level
-        #self.__exit_level = 0
+        #self.__exit_level = 0 See what this means and if it is consistent
        self.__indent = indent
        self.__replace_illegals = replace_illegals
        self.__form_lists = form_lists
@ -184,19 +182,15 @@ class ParseRtf:
            A parsed file in XML, either to standard output or to a file,
            depending on the value of 'output' when the instance was created.
        """
        self.__temp_file = self.__make_temp_file(self.__file)
        #Check to see if the file is correct ascii first
        check_encoding_obj = check_encoding.CheckEncoding(
                bug_handler = RtfInvalidCodeException,
                    )
        if check_encoding_obj.check_encoding(self.__file):
            try:
                os.remove(self.__temp_file)
            except OSError:
                pass
            sys.stderr.write('File "%s" does not appear to be ascii.\n' \
                 % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
            raise InvalidRtfException
        self.__temp_file = self.__make_temp_file(self.__file)
        # if the self.__deb_dir is true, then create a copy object,
        # set the directory to write to, remove files, and copy
        # the new temporary file to this directory
@ -223,7 +217,6 @@ class ParseRtf:
                replace_illegals = self.__replace_illegals,
                )
        line_obj.fix_endings()
        #return_value = line_obj.fix_endings() #calibre: no return in this function, why keep it?
        #self.__return_code(return_value)
        tokenize_obj = tokenize.Tokenize(
                bug_handler = RtfInvalidCodeException,
@ -550,6 +543,7 @@ class ParseRtf:
        write_file="rtf_write_file"
        read_obj = file if hasattr(file, 'read') else open(file,'r')
        write_obj = open(write_file, 'wb')
-        write_obj.write(read_obj.read())
+        for line in read_obj:
            write_obj.write(line)
        write_obj.close()
        return write_file
--- a/src/calibre/ebooks/rtf2xml/check_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/check_encoding.py
@ -14,10 +14,10 @@ class CheckEncoding:
                sys.stderr.write(str(msg) + '\n')
    def check_encoding(self, path, encoding='us-ascii'):
        read_obj = open(path, 'r')
-        input_file = read_obj.read()
+        
        read_obj.close()
        line_num = 0
-        for line in input_file:
+        error_found = False
        for line in read_obj:
            line_num += 1
            try:
                line.decode(encoding)
@ -26,8 +26,9 @@ class CheckEncoding:
                    self.__get_position_error(line, encoding, line_num)
                else:
                    sys.stderr.write('line: %d has bad encoding\n'%line_num)
-                return True
+                error_found = True
-        return False
+        read_obj.close()
        return error_found
 if __name__ == '__main__':
    check_encoding_obj = CheckEncoding()
--- a/src/calibre/ebooks/rtf2xml/copy.py
+++ b/src/calibre/ebooks/rtf2xml/copy.py
@ -23,6 +23,7 @@ class Copy:
    def __init__(self, bug_handler, file = None, deb_dir = None, ):
        """Store the error handler and the file this object works on.

        'bug_handler' is what gets raised when set_dir is given a path that
        is not a directory.  'deb_dir' is accepted but not used here.
        """
        self.__bug_handler = bug_handler
        self.__file = file
    def set_dir(self, deb_dir):
        """Set the temporary directory to write files to"""
        if deb_dir is None:
@ -33,19 +34,11 @@ class Copy:
            message = "%(deb_dir)s is not a directory" % vars()
            raise self.__bug_handler , message
        Copy.__dir = deb_dir
    def remove_files(self ):
        """Remove files from directory"""
        self.__remove_the_files(Copy.__dir)
-        """
+
        list_of_files = os.listdir(Copy.__dir)
        list_of_files = os.listdir(the_dir)
        for file in list_of_files:
            rem_file = os.path.join(Copy.__dir,file)
            if os.path.isdir(rem_file):
                self.remove_files(rem_file)
            else:
                os.remove(rem_file)
        """
    def __remove_the_files(self, the_dir):
        """Remove files from directory"""
        list_of_files = os.listdir(the_dir)
@ -58,6 +51,7 @@ class Copy:
                    os.remove(rem_file)
                except OSError:
                    pass
    def copy_file(self, file, new_file):
        """
        Copy the file to a new name
--- a/src/calibre/ebooks/rtf2xml/process_tokens.py
+++ b/src/calibre/ebooks/rtf2xml/process_tokens.py
@ -735,8 +735,94 @@ class ProcessTokens:
        pre, token, action = self.dict_token.get(token, (None, None, None))
        if action:
            return action(pre, token, num)
-    # unused function
+    
-    def initiate_token_actions(self):
+    def __check_brackets(self, in_file):
        self.__check_brack_obj = check_brackets.CheckBrackets\
            (file = in_file)
        good_br =  self.__check_brack_obj.check_brackets()[0]
        if not good_br:
            return 1
    def process_tokens(self):
        """Translate the tokenized RTF file into processed-token lines.

        Reads self.__file one token per line and writes the translation to
        self.__write_to.  Tokens that start with a backslash are handed to
        self.process_cw; every other token is split with self.__utf_exp and
        written as a 'tx<ut<' line (field starts with '&') or a 'tx<nu<'
        line (anything else).  The translated file then replaces
        self.__file and its brackets are checked.

        Returns:
            self.__return_code, unchanged.

        Raises:
            self.__exception_handler: if the first token is not '\\{', the
                second token does not start with '\\rtf', a token contains
                '\\ ', the file has no lines, or the bracket check fails.
        """
        read_obj = open(self.__file, 'r')
        try:
            write_obj = open(self.__write_to, 'wb')
            try:
                line_count = self.__write_processed_tokens(read_obj, write_obj)
            finally:
                # Close even when a token is rejected, so no handle is leaked.
                write_obj.close()
        finally:
            read_obj.close()
        if not line_count:
            raise self.__exception_handler(
                'Invalid RTF: file appears to be empty.\n')
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "processed_tokens.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        if self.__check_brackets(self.__file):
            raise self.__exception_handler(
                'Invalid RTF: document does not have matching brackets.\n')
        return self.__return_code

    def __write_processed_tokens(self, read_obj, write_obj):
        """Write the processed form of every token line; return the line count."""
        line_count = 0
        for line in read_obj:
            token = line.replace("\n", "")
            # NOTE(review): blank tokens and non-ASCII input are not filtered
            # here; earlier comments say the tokenizer and the encoding check
            # take care of them -- confirm against those passes.
            line_count += 1
            if line_count == 1 and token != '\\{':
                raise self.__exception_handler(
                    'Invalid RTF: document doesn\'t start with {\n')
            if line_count == 2 and token[0:4] != '\\rtf':
                raise self.__exception_handler(
                    'Invalid RTF: document doesn\'t start with \\rtf \n')
            if '\\ ' in token:
                raise self.__exception_handler(
                    'Invalid RTF: token "\\ " not valid.\n')
            if token[:1] == "\\":
                processed = self.process_cw(token)
                if processed is not None:
                    write_obj.write(processed)
                continue
            for field in re.split(self.__utf_exp, token):
                if not field:
                    continue
                if field[0:1] == '&':
                    write_obj.write('tx<ut<__________<%s\n' % field)
                else:
                    write_obj.write('tx<nu<__________<%s\n' % field)
        return line_count
    '''def initiate_token_actions(self):
        self.action_for_token={
        '{'     :   self.ob_func,
        '}'     :   self.cb_func,
@ -752,75 +838,4 @@ class ProcessTokens:
            line = action(token)
            return line
        else :
-            return  'tx<nu<nu<nu<nu<%s\n' % token
+            return  'tx<nu<nu<nu<nu<%s\n' % token'''
    def __check_brackets(self, in_file):
        """Run the bracket checker on 'in_file'.

        The checker object is kept on the instance.  Returns 1 when the
        first element of the checker's result is false, None otherwise.
        """
        checker = check_brackets.CheckBrackets(file = in_file)
        self.__check_brack_obj = checker
        brackets_ok = checker.check_brackets()[0]
        if brackets_ok:
            return None
        return 1
    def process_tokens(self):
        """Translate the tokenized RTF file into processed-token lines.

        Reads self.__file one token per line, skipping empty tokens, and
        writes the translation to self.__write_to.  Tokens starting with a
        backslash go through self.process_cw; other tokens are split with
        self.__utf_exp and written as 'tx<ut<' or 'tx<nu<' lines.  The
        translated file then replaces self.__file and its brackets are
        checked.

        Returns self.__return_code.  Raises self.__exception_handler when a
        token is not us-ascii, the first token is not '\\{', the second does
        not start with '\\rtf', a token contains '\\ ', no token was read,
        or the bracket check fails.
        """
        # Flags recording that the first and second non-empty tokens
        # have been validated.
        first_token = 0
        second_token = 0
        # NOTE(review): neither handle is closed if one of the raises inside
        # the loop below fires.
        read_obj = open(self.__file, 'r')
        write_obj = open(self.__write_to, 'w')
        # Non-empty placeholder so the while loop is entered.
        line_to_read = "dummy"
        line_count = 0
        while line_to_read:
            line_to_read = read_obj.readline()
            token = line_to_read
            token = token.replace("\n","")
            # Empty tokens are not counted; at end of file readline() returns
            # '' so this 'continue' falls back to the loop test, which exits.
            if not token:
                continue
            line_count += 1
            # Reject any token that cannot be decoded as us-ascii.
            try:
                token.decode('us-ascii')
            except UnicodeError, msg:
                msg = str(msg)
                msg += 'Invalid RTF: File not ascii encoded.\n'
                raise self.__exception_handler, msg
            # The first non-empty token must be '\{' and the second must
            # start with '\rtf'.
            if not first_token:
                if token != '\\{':
                    msg = 'Invalid RTF: document doesn\'t start with {\n'
                    raise self.__exception_handler, msg
                first_token = 1
            elif first_token and not second_token:
                if token[0:4] != '\\rtf':
                    msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
                    raise self.__exception_handler, msg
                second_token = 1
            ##token = self.evaluate_token(token)
            # A backslash followed by a space anywhere in the token is
            # rejected.
            the_index = token.find('\\ ')
            # NOTE(review): the None test cannot fail here, token.find() has
            # already been called on the line above.
            if token != None and  the_index > -1:
                msg ='Invalid RTF: token "\\ " not valid. \n'
                raise self.__exception_handler, msg
            elif token[0:1] == "\\":
                # Token starts with a backslash: let process_cw translate it;
                # nothing is written when it returns None.
                line = self.process_cw(token)
                if line != None:
                    write_obj.write(line)
            else:
                # Any other token: split on self.__utf_exp and write each
                # non-empty field, marking fields that start with '&' as 'ut'
                # and the rest as 'nu'.
                fields = re.split(self.__utf_exp, token)
                for field in fields:
                    if not field:
                        continue
                    if field[0:1] == '&':
                        write_obj.write('tx<ut<__________<%s\n' % field)
                    else:
                        write_obj.write('tx<nu<__________<%s\n' % field)
        read_obj.close()
        write_obj.close()
        # No non-empty token was read at all.
        if not line_count:
            msg ='Invalid RTF: file appears to be empty. \n'
            raise self.__exception_handler, msg
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        # Keep a copy of the result under a fixed name when self.__copy is set.
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "processed_tokens.data")
        # Replace the input file with the translated output, then drop the
        # temporary output file.
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        bad_brackets = self.__check_brackets(self.__file)
        if bad_brackets:
            msg = 'Invalid RTF: document does not have matching brackets.\n'
            raise self.__exception_handler, msg
        else:
            return self.__return_code