From a9fd0ad4ba9acdcc07d5bfcae503c378c25a7303 Mon Sep 17 00:00:00 2001
From: Sengian <sengian1@gmail.com>
Date: Mon, 16 Aug 2010 10:08:59 +0200
Subject: [PATCH] Global overhaul of rtf2xml: RTFfixes (5) -> minor corrections
 and regression correction

---
 src/calibre/ebooks/rtf/input.py              |   2 +-
 src/calibre/ebooks/rtf2xml/ParseRtf.py       |  14 +-
 src/calibre/ebooks/rtf2xml/check_encoding.py |  11 +-
 src/calibre/ebooks/rtf2xml/copy.py           |  14 +-
 src/calibre/ebooks/rtf2xml/process_tokens.py | 163 ++++++++++---------
 5 files changed, 104 insertions(+), 100 deletions(-)

diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 824da7d6f1..f4fbdf411c 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -50,7 +50,7 @@ class RTFInput(InputFormatPlugin):
         parser = ParseRtf(
             in_file    = stream,
             out_file   = ofile,
-			deb_dir = 'I:\\Calibre\\rtfdebug',
+			deb_dir = 'D:\\calibre\\pierre\\debug\\rtfdebug',
             # Convert symbol fonts to unicode equivalents. Default
             # is 1
             convert_symbol = 1,
diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py
index 76bdcc08af..1230ae150e 100755
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@@ -120,8 +120,6 @@ class ParseRtf:
             script tries to output to directory where is script is exectued.)
             'deb_dir' --debug directory. If a debug_dir is provided, the script
             will copy each run through as a file to examine in the debug_dir
-            'perl_script'--use perl to make tokens. This runs just a bit faster.
-            (I will probably phase this out.)
             'check_brackets' -- make sure the brackets match up after each run
             through a file. Only for debugging.
         Returns: Nothing
@@ -142,7 +140,7 @@ class ParseRtf:
         self.__convert_wingdings = convert_wingdings
         self.__convert_zapf = convert_zapf
         self.__run_level = run_level
-        #self.__exit_level = 0
+        #self.__exit_level = 0 TODO: see what this means and whether it is consistent
         self.__indent = indent
         self.__replace_illegals = replace_illegals
         self.__form_lists = form_lists
@@ -184,19 +182,15 @@ class ParseRtf:
             A parsed file in XML, either to standard output or to a file,
             depending on the value of 'output' when the instance was created.
         """
-        self.__temp_file = self.__make_temp_file(self.__file)
         #Check to see if the file is correct ascii first
         check_encoding_obj = check_encoding.CheckEncoding(
                 bug_handler = RtfInvalidCodeException,
                     )
         if check_encoding_obj.check_encoding(self.__file):
-            try:
-                os.remove(self.__temp_file)
-            except OSError:
-                pass
             sys.stderr.write('File "%s" does not appear to be ascii.\n' \
                  % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
             raise InvalidRtfException
+        self.__temp_file = self.__make_temp_file(self.__file)
         # if the self.__deb_dir is true, then create a copy object,
         # set the directory to write to, remove files, and copy
         # the new temporary file to this directory
@@ -223,7 +217,6 @@ class ParseRtf:
                 replace_illegals = self.__replace_illegals,
                 )
         line_obj.fix_endings()
-        #return_value = line_obj.fix_endings() #calibre: no return in this function, why keep it?
         #self.__return_code(return_value)
         tokenize_obj = tokenize.Tokenize(
                 bug_handler = RtfInvalidCodeException,
@@ -550,6 +543,7 @@ class ParseRtf:
         write_file="rtf_write_file"
         read_obj = file if hasattr(file, 'read') else open(file,'r')
         write_obj = open(write_file, 'wb')
-        write_obj.write(read_obj.read())
+        for line in read_obj:
+            write_obj.write(line)
         write_obj.close()
         return write_file
\ No newline at end of file
diff --git a/src/calibre/ebooks/rtf2xml/check_encoding.py b/src/calibre/ebooks/rtf2xml/check_encoding.py
index 1f8645bb0c..444fd373e4 100755
--- a/src/calibre/ebooks/rtf2xml/check_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/check_encoding.py
@@ -14,10 +14,10 @@ class CheckEncoding:
                 sys.stderr.write(str(msg) + '\n')
     def check_encoding(self, path, encoding='us-ascii'):
         read_obj = open(path, 'r')
-        input_file = read_obj.read()
-        read_obj.close()
+        
         line_num = 0
-        for line in input_file:
+        error_found = False
+        for line in read_obj:
             line_num += 1
             try:
                 line.decode(encoding)
@@ -26,8 +26,9 @@ class CheckEncoding:
                     self.__get_position_error(line, encoding, line_num)
                 else:
                     sys.stderr.write('line: %d has bad encoding\n'%line_num)
-                return True
-        return False
+                error_found = True
+        read_obj.close()
+        return error_found
 
 if __name__ == '__main__':
     check_encoding_obj = CheckEncoding()
diff --git a/src/calibre/ebooks/rtf2xml/copy.py b/src/calibre/ebooks/rtf2xml/copy.py
index ff029c1841..1b620b9fbf 100755
--- a/src/calibre/ebooks/rtf2xml/copy.py
+++ b/src/calibre/ebooks/rtf2xml/copy.py
@@ -23,6 +23,7 @@ class Copy:
     def __init__(self, bug_handler, file = None, deb_dir = None, ):
         self.__file = file
         self.__bug_handler = bug_handler
+
     def set_dir(self, deb_dir):
         """Set the temporary directory to write files to"""
         if deb_dir is None:
@@ -33,19 +34,11 @@ class Copy:
             message = "%(deb_dir)s is not a directory" % vars()
             raise self.__bug_handler , message
         Copy.__dir = deb_dir
+
     def remove_files(self ):
         """Remove files from directory"""
         self.__remove_the_files(Copy.__dir)
-        """
-        list_of_files = os.listdir(Copy.__dir)
-        list_of_files = os.listdir(the_dir)
-        for file in list_of_files:
-            rem_file = os.path.join(Copy.__dir,file)
-            if os.path.isdir(rem_file):
-                self.remove_files(rem_file)
-            else:
-                os.remove(rem_file)
-        """
+
     def __remove_the_files(self, the_dir):
         """Remove files from directory"""
         list_of_files = os.listdir(the_dir)
@@ -58,6 +51,7 @@ class Copy:
                     os.remove(rem_file)
                 except OSError:
                     pass
+
     def copy_file(self, file, new_file):
         """
         Copy the file to a new name
diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py
index 072d8b02e4..2c5c0c7df0 100755
--- a/src/calibre/ebooks/rtf2xml/process_tokens.py
+++ b/src/calibre/ebooks/rtf2xml/process_tokens.py
@@ -735,8 +735,94 @@ class ProcessTokens:
         pre, token, action = self.dict_token.get(token, (None, None, None))
         if action:
             return action(pre, token, num)
-    # unused function
-    def initiate_token_actions(self):
+    
+    def __check_brackets(self, in_file):
+        self.__check_brack_obj = check_brackets.CheckBrackets\
+            (file = in_file)
+        good_br =  self.__check_brack_obj.check_brackets()[0]
+        if not good_br:
+            return 1
+    def process_tokens(self):
+        """Main method for handling other methods. """
+        
+        read_obj= open(self.__file, 'r')
+        write_obj = open(self.__write_to, 'wb')
+        
+        '''first_token = 0
+        second_token = 0'''
+        line_count = 0
+        
+        for line in read_obj:
+            token = line.replace("\n","")
+            #calibre: normally not necessary, fixed in tokenize
+            '''if not token:
+                continue'''
+            line_count += 1
+            #calibre: not necessary, encoding is checked beforehand
+            """try:
+                token.decode('us-ascii')
+            except UnicodeError, msg:
+                msg = str(msg)
+                msg += 'Invalid RTF: File not ascii encoded.\n'
+                raise self.__exception_handler, msg"""
+            #calibre: with tokenize, should be first and second line, why bother?
+            """if not first_token:
+                if token != '\\{':
+                    msg = 'Invalid RTF: document doesn\'t start with {\n'
+                    raise self.__exception_handler, msg
+                first_token = 1
+            elif line_count ==  and not second_token:
+                if token[0:4] != '\\rtf':
+                    msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
+                    raise self.__exception_handler, msg
+                second_token = 1"""
+            if line_count == 1 and token != '\\{':
+                    msg = 'Invalid RTF: document doesn\'t start with {\n'
+                    raise self.__exception_handler, msg
+            elif line_count == 2 and token[0:4] != '\\rtf':
+                    msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
+                    raise self.__exception_handler, msg
+            
+            ##token = self.evaluate_token(token)
+            the_index = token.find('\\ ')
+            if token is not None and  the_index > -1:
+                msg ='Invalid RTF: token "\\ " not valid.\n'
+                raise self.__exception_handler, msg
+            elif token[:1] == "\\":
+                line = self.process_cw(token)
+                if line is not None:
+                    write_obj.write(line)
+            else:
+                fields = re.split(self.__utf_exp, token)
+                for field in fields:
+                    if not field:
+                        continue
+                    if field[0:1] == '&':
+                        write_obj.write('tx<ut<__________<%s\n' % field)
+                    else:
+                        write_obj.write('tx<nu<__________<%s\n' % field)
+        
+        read_obj.close()
+        write_obj.close()
+        
+        if not line_count:
+            msg ='Invalid RTF: file appears to be empty.\n'
+            raise self.__exception_handler, msg
+        
+        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "processed_tokens.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
+        
+        bad_brackets = self.__check_brackets(self.__file)
+        if bad_brackets:
+            msg = 'Invalid RTF: document does not have matching brackets.\n'
+            raise self.__exception_handler, msg
+        else:
+            return self.__return_code
+
+    '''def initiate_token_actions(self):
         self.action_for_token={
         '{'     :   self.ob_func,
         '}'     :   self.cb_func,
@@ -752,75 +838,4 @@ class ProcessTokens:
             line = action(token)
             return line
         else :
-            return  'tx<nu<nu<nu<nu<%s\n' % token
-    def __check_brackets(self, in_file):
-        self.__check_brack_obj = check_brackets.CheckBrackets\
-            (file = in_file)
-        good_br =  self.__check_brack_obj.check_brackets()[0]
-        if not good_br:
-            return 1
-    def process_tokens(self):
-        """Main method for handling other methods. """
-        first_token = 0
-        second_token = 0
-        read_obj = open(self.__file, 'r')
-        write_obj = open(self.__write_to, 'w')
-        line_to_read = "dummy"
-        line_count = 0
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            token = line_to_read
-            token = token.replace("\n","")
-            if not token:
-                continue
-            line_count += 1
-            try:
-                token.decode('us-ascii')
-            except UnicodeError, msg:
-                msg = str(msg)
-                msg += 'Invalid RTF: File not ascii encoded.\n'
-                raise self.__exception_handler, msg
-            if not first_token:
-                if token != '\\{':
-                    msg = 'Invalid RTF: document doesn\'t start with {\n'
-                    raise self.__exception_handler, msg
-                first_token = 1
-            elif first_token and not second_token:
-                if token[0:4] != '\\rtf':
-                    msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
-                    raise self.__exception_handler, msg
-                second_token = 1
-            ##token = self.evaluate_token(token)
-            the_index = token.find('\\ ')
-            if token != None and  the_index > -1:
-                msg ='Invalid RTF: token "\\ " not valid. \n'
-                raise self.__exception_handler, msg
-            elif token[0:1] == "\\":
-                line = self.process_cw(token)
-                if line != None:
-                    write_obj.write(line)
-            else:
-                fields = re.split(self.__utf_exp, token)
-                for field in fields:
-                    if not field:
-                        continue
-                    if field[0:1] == '&':
-                        write_obj.write('tx<ut<__________<%s\n' % field)
-                    else:
-                        write_obj.write('tx<nu<__________<%s\n' % field)
-        read_obj.close()
-        write_obj.close()
-        if not line_count:
-            msg ='Invalid RTF: file appears to be empty. \n'
-            raise self.__exception_handler, msg
-        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
-        if self.__copy:
-            copy_obj.copy_file(self.__write_to, "processed_tokens.data")
-        copy_obj.rename(self.__write_to, self.__file)
-        os.remove(self.__write_to)
-        bad_brackets = self.__check_brackets(self.__file)
-        if bad_brackets:
-            msg = 'Invalid RTF: document does not have matching brackets.\n'
-            raise self.__exception_handler, msg
-        else:
-            return self.__return_code
+            return  'tx<nu<nu<nu<nu<%s\n' % token'''