Global overhaul of rtf2xml : RTF fixes (1)

2025-07-09 03:04:10 -04:00 · 2010-07-31 10:47:12 +02:00 · 2010-07-31 10:47:12 +02:00 · 09c8f13a1f
commit 09c8f13a1f
parent 8512f57866
7 changed files with 52 additions and 77 deletions
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@ -50,6 +50,7 @@ class RTFInput(InputFormatPlugin):
        parser = ParseRtf(
            in_file    = stream,
            out_file   = ofile,
+            #deb_dir = 'I:\\Calibre\\rtfdebug',
            # Convert symbol fonts to unicode equivalents. Default
            # is 1
            convert_symbol = 1,
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@ -143,7 +143,7 @@ class ParseRtf:
        self.__convert_wingdings = convert_wingdings
        self.__convert_zapf = convert_zapf
        self.__run_level = run_level
-        self.__exit_level = 0
+        #self.__exit_level = 0
        self.__indent = indent
        self.__replace_illegals = replace_illegals
        self.__form_lists = form_lists
@ -162,8 +162,7 @@ class ParseRtf:
        elif os.path.exists(the_file):
            pass # do nothing
        else:
-            message = "\nThe file '%s' cannot be found" % the_file
-            msg = message
+            msg = "\nThe file '%s' cannot be found" % the_file
            raise RtfInvalidCodeException, msg
    def __check_dir(self, the_dir):
        """Check to see if directory exists"""
@ -180,8 +179,7 @@ class ParseRtf:
            test = codecs.open(the_file, 'r', 'ascii', 'strict')
            test.close()
        except UnicodeError:
-            message= "\n%s is not a correct ascii file" % the_file
-            msg = message
+            msg = "\n%s is not a correct ascii file" % the_file
            raise RtfInvalidCodeException, msg
        return 1
    def parse_rtf(self):
@ -204,27 +202,29 @@ class ParseRtf:
            copy_obj.set_dir(self.__debug_dir)
            copy_obj.remove_files()
            copy_obj.copy_file(self.__temp_file, "original_file")
-        # new as of 2005-08-02. Do I want this?
+        # Create the checker used to verify that brackets are properly matched
        if self.__debug_dir or self.__run_level > 2:
            self.__check_brack_obj = check_brackets.CheckBrackets\
            (file = self.__temp_file,
                bug_handler = RtfInvalidCodeException,
                    )
-        # convert Macintosh line endings to Unix line endings
+        # convert Macintosh and Windows line endings to Unix line endings
+        #why do this if you don't wb after?
        line_obj = line_endings.FixLineEndings(
                in_file = self.__temp_file,
                bug_handler = RtfInvalidCodeException,
                copy = self.__copy,
-                run_level = self.__run_level,
+                #run_level = self.__run_level,
                replace_illegals = self.__replace_illegals,
                )
-        return_value = line_obj.fix_endings()
-        self.__return_code(return_value)
+        line_obj.fix_endings()
+        #return_value = line_obj.fix_endings() #calibre: no return in this function, why keep it?
+        #self.__return_code(return_value)
        tokenize_obj = tokenize.Tokenize(
                bug_handler = RtfInvalidCodeException,
                in_file = self.__temp_file,
-                copy = self.__copy,
-                run_level = self.__run_level,)
+                copy = self.__copy,)
+                #run_level = self.__run_level,)
        tokenize_obj.tokenize()
        process_tokens_obj = process_tokens.ProcessTokens(
            in_file = self.__temp_file,
@ -529,7 +529,7 @@ class ParseRtf:
            )
        output_obj.output()
        os.remove(self.__temp_file)
-        return self.__exit_level
+        #return self.__exit_level
    def __bracket_match(self, file_name):
        if self.__run_level > 2:
            good_br, msg =  self.__check_brack_obj.check_brackets()
@ -539,26 +539,17 @@ class ParseRtf:
            else:
                msg += msg +  " in file '" + file_name + "'\n"
                raise RtfInvalidCodeException, msg
-    def __return_code(self, num):
-        if num == None:
-            return
-        if int(num) > self.__exit_level:
-            self.__exit_level = num
+    #def __return_code(self, num): calibre not used
+    #   if num == None:
+    #       return
+    #   if int(num) > self.__exit_level:
+    #       self.__exit_level = num
    def __make_temp_file(self,file):
        """Make a temporary file to parse"""
        write_file="rtf_write_file"
        read_obj = file if hasattr(file, 'read') else open(file,'r')
        write_obj = open(write_file, 'w')
-        line = "dummy"
-        while line:
-            line = read_obj.read(1000)
+        for line in read_obj:
            write_obj.write(line)
        write_obj.close()
        return write_file
-    """
-mi<tg<open______<style-sheet\n
-mi<tg<close_____<style-sheet\n
-mi<tg<open-att__<footnote<num>1\n
-mi<tg<empty-att_<page-definition<margin>33\n
-mi<tg<empty_____<para\n
-"""
--- a/src/calibre/ebooks/rtf2xml/check_brackets.py
+++ b/src/calibre/ebooks/rtf2xml/check_brackets.py
@ -34,18 +34,16 @@ class CheckBrackets:
        try:
            last_num = self.__open_bracket_num.pop()
        except:
-            return 0
+            return False
        if num != last_num:
-            return 0
+            return False
        self.__bracket_count -= 1
-        return 1
+        return True
    def check_brackets(self):
        read_obj = open(self.__file, 'r')
-        line = 'dummy'
        line_count = 0
-        while line:
+        for line in read_obj:
            line_count += 1
-            line = read_obj.readline()
            self.__token_info = line[:16]
            if self.__token_info == 'ob<nu<open-brack':
                self.open_brack(line)
--- a/src/calibre/ebooks/rtf2xml/line_endings.py
+++ b/src/calibre/ebooks/rtf2xml/line_endings.py
@ -23,43 +23,31 @@ class FixLineEndings:
            bug_handler,
            in_file = None,
            copy = None,
-            run_level = 1,
+            #run_level = 1,  # calibre: why keep it?
            replace_illegals = 1,
            ):
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
-        self.__run_level = run_level
        self.__write_to = tempfile.mktemp()
        self.__replace_illegals = replace_illegals
    def fix_endings(self):
-        ##tempFileName = tempfile.mktemp()
        illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
-        #nums = [0, 1, 2, 3, 4, 5, 6, 7, 8,  11,  14, 15, 16, 17, 18, 19]
-        """
-read_obj = open(self.__file, 'r')
-line = read_obj.read(1000)
-regexp = re.compile(r"\r")
-macintosh = regexp.search(line)
-read_obj.close()
-        """
        # always check since I have to get rid of illegal characters
-        macintosh = 1
-        if macintosh:
-            line = 1
+        #read
        read_obj = open(self.__file, 'r')
-            write_obj = open(self.__write_to, 'w')
-            while line:
-                line = read_obj.read(1000)
-                # line = re.sub(regexp,"\n",line)
-                line = line.replace ('\r', '\n')
-                if self.__replace_illegals:
-                    line = re.sub(illegal_regx, '', line)
-                    # for num in nums:
-                        # line = line.replace(chr(num), '')
-                write_obj.write(line )
+        input_file = read_obj.read()
        read_obj.close()
+        # calibre: convert Windows (\r\n) and Mac (\r) line endings to Unix (\n)
+        input_file = input_file.replace ('\r\n', '\n')
+        input_file = input_file.replace ('\r', '\n')
+        if self.__replace_illegals:
+            input_file = re.sub(illegal_regx, '', input_file)
+        #write
+        write_obj = open(self.__write_to, 'wb')
+        write_obj.write(input_file)
        write_obj.close()
+        #copy
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "line_endings.data")
--- a/src/calibre/ebooks/rtf2xml/process_tokens.py
+++ b/src/calibre/ebooks/rtf2xml/process_tokens.py
@ -645,10 +645,8 @@ class ProcessTokens:
        return 'tx<nu<__________<%s\n' % token
    def ob_func(self, pre, token, num=None):
        self.__bracket_count += 1
-        ##return 'ob<%04d\n' % self.__bracket_count
        return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
    def cb_func(self, pre, token, num=None):
-        ##line = 'cb<%04d\n' % self.__bracket_count
        line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
        self.__bracket_count -= 1
        return line
--- a/src/calibre/ebooks/rtf2xml/tokenize.py
+++ b/src/calibre/ebooks/rtf2xml/tokenize.py
@ -23,7 +23,7 @@ class Tokenize:
            in_file,
            bug_handler,
            copy = None,
-            run_level = 1,
+            #run_level = 1,
    ):
        self.__file = in_file
        self.__bug_handler = bug_handler
@ -80,7 +80,7 @@ class Tokenize:
    def __create_tokens(self):
        self.__compile_expressions()
        read_obj = open(self.__file, 'r')
-        write_obj = open(self.__write_to, 'w')
+        write_obj = open(self.__write_to, 'wb')
        line_to_read = "dummy"
        while line_to_read:
            line_to_read = read_obj.readline()
@ -106,7 +106,7 @@ class Tokenize:
        write_obj.close()
    def tokenize(self):
        """Main class for handling other methods. Reads in one line \
-        at a time, usues method self.sub_line to make basic substitutions,\
+        at a time, uses method self.sub_line to make basic substitutions,\
        uses ? to process tokens"""
        self.__create_tokens()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@ -4,8 +4,7 @@
 Read content from txt file.
 '''

-import os
-import re
+import os, re

 from calibre import prepare_string_for_xml
 from calibre.ebooks.markdown import markdown