RTFInput: Cleanup & small improvements

2025-07-09 03:04:10 -04:00 · 2012-05-05 15:34:00 +02:00 · 2012-05-05 15:34:00 +02:00 · 60b53045e4
commit 60b53045e4
parent bd5e6585ff
7 changed files with 298 additions and 264 deletions
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@ -372,8 +372,8 @@ class ParseRtf:
        old_rtf = old_rtf_obj.check_if_old_rtf()
        if old_rtf:
            if self.__run_level > 5:
-                msg = 'Older RTF\n'
-                msg += 'self.__run_level is "%s"\n' % self.__run_level
+                msg = 'Older RTF\n' \
+                'self.__run_level is "%s"\n' % self.__run_level
                raise RtfInvalidCodeException, msg
            if self.__run_level > 1:
                sys.stderr.write('File could be older RTF...\n')
@ -381,7 +381,7 @@ class ParseRtf:
                if self.__run_level > 1:
                    sys.stderr.write(
                        'File also has newer RTF.\n'
-                        'Will do the best to convert.\n'
+                        'Will do the best to convert...\n'
                    )
            add_brackets_obj = add_brackets.AddBrackets(
                    in_file = self.__temp_file,
--- a/src/calibre/ebooks/rtf2xml/add_brackets.py
+++ b/src/calibre/ebooks/rtf2xml/add_brackets.py
@ -20,6 +20,9 @@ class AddBrackets:
    """
    Add brackets for old RTF.
    Logic:
+    When control words without their own brackets are encountered
+    and in the list of allowed words, this will add brackets
+    to facilitate the treatment of the file
    """
    def __init__(self, in_file,
            bug_handler,
@ -41,26 +44,17 @@ class AddBrackets:
        self.__copy = copy
        self.__write_to = better_mktemp()
        self.__run_level = run_level
-
-    def __initiate_values(self):
-        """
-        """
        self.__state_dict = {
            'before_body'           : self.__before_body_func,
            'in_body'               : self.__in_body_func,
            'after_control_word'    : self.__after_control_word_func,
            'in_ignore'             : self.__ignore_func,
        }
-        self.__state = 'before_body'
-        self.__inline = {}
-        self.__temp_group = []
-        self.__open_bracket = False
-        self.__found_brackets = False
        self.__accept = [
            'cw<ci<bold______' ,
            'cw<ci<annotation' ,
            'cw<ci<blue______' ,
-        'cw<ci<bold______' ,
+            # 'cw<ci<bold______' ,
            'cw<ci<caps______' ,
            'cw<ci<char-style' ,
            'cw<ci<dbl-strike' ,
@ -86,6 +80,16 @@ class AddBrackets:
            # 'cw<ul<underlined' ,
        ]

+    def __initiate_values(self):
+        """
+        """
+        self.__state = 'before_body'
+        self.__inline = {}
+        self.__temp_group = []
+        self.__open_bracket = False
+        self.__found_brackets = False
+        
+
    def __before_body_func(self, line):
        """
        """
--- a/src/calibre/ebooks/rtf2xml/header.py
+++ b/src/calibre/ebooks/rtf2xml/header.py
@ -11,6 +11,7 @@
 #                                                                       #
 #########################################################################
 import sys, os
+
 from calibre.ebooks.rtf2xml import copy
 from calibre.ptempfile import better_mktemp

@ -31,29 +32,29 @@ class Header:
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
-        self.__found_a_header = 0
+        self.__found_a_header = False
+
    def __in_header_func(self, line):
        """
        Handle all tokens that are part of header
        """
        if self.__cb_count == self.__header_bracket_count:
-            self.__in_header = 0
+            self.__in_header = False
            self.__write_obj.write(line)
            self.__write_to_head_obj.write(
-            'mi<mk<head___clo\n')
-            self.__write_to_head_obj.write(
-            'mi<tg<close_____<header-or-footer\n')
-            self.__write_to_head_obj.write(
+            'mi<mk<head___clo\n' \
+            'mi<tg<close_____<header-or-footer\n' \
            'mi<mk<header-clo\n')
        else:
            self.__write_to_head_obj.write(line)
+
    def __found_header(self, line):
        """
        Found a header
        """
        # but this could be header or footer
-        self.__found_a_header = 1
-        self.__in_header = 1
+        self.__found_a_header = True
+        self.__in_header = True
        self.__header_count += 1
        # temporarily set this to zero so I can enter loop
        self.__cb_count = 0
@ -69,18 +70,23 @@ class Header:
                    'mi<tg<open-att__<header-or-footer<type>%s\n' % (type)
                    )
        else:
-            sys.stderr.write('module is header\n')
-            sys.stderr.write('method is __found_header\n')
-            sys.stderr.write('no dict entry\n')
-            sys.stderr.write('line is %s' % line)
+            sys.stderr.write(
+            'module is header\n' \
+            'method is __found_header\n' \
+            'no dict entry\n' \
+            'line is %s' % line)
            self.__write_to_head_obj.write(
                    'mi<tg<open-att__<header-or-footer<type>none\n'
                    )
+
    def __default_sep(self, line):
-        """Handle all tokens that are not header tokens"""
+        """
+        Handle all tokens that are not header tokens
+        """
        if self.__token_info[3:5] == 'hf':
            self.__found_header(line)
        self.__write_obj.write(line)
+
    def __initiate_sep_values(self):
        """
        initiate counters for separate_footnotes method.
@ -89,7 +95,7 @@ class Header:
        self.__ob_count = 0
        self.__cb_count = 0
        self.__header_bracket_count = 0
-        self.__in_header = 0
+        self.__in_header = False
        self.__header_count = 0
        self.__head_dict = {
            'head-left_'        :   ('header-left'),
@ -101,6 +107,7 @@ class Header:
            'header____'        :   ('header' ),
            'footer____'        :   ('footer' ),
        }
+
    def separate_headers(self):
        """
        Separate all the footnotes in an RTF file and put them at the bottom,
@ -110,14 +117,11 @@ class Header:
        bottom of the main file.
        """
        self.__initiate_sep_values()
-        read_obj = open(self.__file)
-        self.__write_obj = open(self.__write_to, 'w')
        self.__header_holder = better_mktemp()
-        self.__write_to_head_obj = open(self.__header_holder, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
+        with open(self.__file) as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                with open(self.__header_holder, 'w') as self.__write_to_head_obj:
+                    for line in read_obj:
                        self.__token_info = line[:16]
                        # keep track of opening and closing brackets
                        if self.__token_info == 'ob<nu<open-brack':
@ -130,33 +134,30 @@ class Header:
                        # not in the middle of footnote text
                        else:
                            self.__default_sep(line)
-        self.__write_obj.close()
-        read_obj.close()
-        self.__write_to_head_obj.close()
-        read_obj = open(self.__header_holder, 'r')
-        write_obj = open(self.__write_to, 'a')
+        
+        with open(self.__header_holder, 'r') as read_obj:
+            with open(self.__write_to, 'a') as write_obj:
                write_obj.write(
                'mi<mk<header-beg\n')
-        line = 1
-        while line:
-            line = read_obj.readline()
+                for line in read_obj:
                    write_obj.write(line)
                write_obj.write(
                'mi<mk<header-end\n')
-        read_obj.close()
-        write_obj.close()
        os.remove(self.__header_holder)
+
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
-            copy_obj.copy_file(self.__write_to, "header_separate.info")
+            copy_obj.copy_file(self.__write_to, "header_separate.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
+
    def update_info(self, file, copy):
        """
        Unused method
        """
        self.__file = file
        self.__copy = copy
+
    def __get_head_body_func(self, line):
        """
        Process lines in main body and look for beginning of headers.
@ -166,6 +167,7 @@ class Header:
            self.__state = 'head'
        else:
            self.__write_obj.write(line)
+
    def __get_head_head_func(self, line):
        """
        Copy headers and footers from bottom of file to a separate, temporary file.
@ -174,6 +176,7 @@ class Header:
            self.__state = 'body'
        else:
            self.__write_to_head_obj.write(line)
+
    def __get_headers(self):
        """
        Private method to remove footnotes from main file.  Read one line from
@ -182,21 +185,16 @@ class Header:
        These two functions do the work of separating the footnotes form the
        body.
        """
-        read_obj = open(self.__file)
-        self.__write_obj = open(self.__write_to, 'w')
-            # self.__write_to = "footnote_info.data"
-        self.__write_to_head_obj = open(self.__header_holder, 'w')
-        line = 1
-        while line:
-            line = read_obj.readline()
+        with open(self.__file) as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                with open(self.__header_holder, 'w') as self.__write_to_head_obj:
+                    for line in read_obj:
                        self.__token_info = line[:16]
                        if self.__state == 'body':
                            self.__get_head_body_func(line)
                        elif self.__state == 'head':
                            self.__get_head_head_func(line)
-        read_obj.close()
-        self.__write_obj.close()
-        self.__write_to_head_obj.close()
+
    def __get_head_from_temp(self, num):
        """
        Private method for joining headers and footers to body. This method
@ -205,18 +203,17 @@ class Header:
        returns them as a string.
        """
        look_for = 'mi<mk<header-ope<' + num + '\n'
-        found_head = 0
+        found_head = False
        string_to_return = ''
-        line = 1
-        while line:
-            line = self.__read_from_head_obj.readline()
+        for line in self.__read_from_head_obj:
            if found_head:
                if line == 'mi<mk<header-clo\n':
                    return string_to_return
-                string_to_return = string_to_return + line
+                string_to_return += line
            else:
                if line == look_for:
-                    found_head = 1
+                    found_head = True
+
    def __join_from_temp(self):
        """
        Private method for rejoining footnotes to body.  Read from the
@ -227,15 +224,13 @@ class Header:
        If no footnote marker is found, simply print out the token (line).
        """
        self.__read_from_head_obj = open(self.__header_holder, 'r')
-        read_obj = open(self.__write_to, 'r')
        self.__write_obj = open(self.__write_to2, 'w')
-        line = 1
-        while line:
-            line = read_obj.readline()
+        with open(self.__write_to, 'r') as read_obj:
+            for line in read_obj:
                if line[:16] == 'mi<mk<header-ind':
                    line = self.__get_head_from_temp(line[17:-1])
                self.__write_obj.write(line)
-        read_obj.close()
+
    def join_headers(self):
        """
        Join the footnotes from the bottom of the file and put them in their
--- a/src/calibre/ebooks/rtf2xml/old_rtf.py
+++ b/src/calibre/ebooks/rtf2xml/old_rtf.py
@ -11,14 +11,18 @@
 #                                                                       #
 #########################################################################
 import sys
-"""
-"""
+
 class OldRtf:
    """
    Check to see if the RTF is an older version
    Logic:
+    If allowable control word/properties happen in text without being enclosed
+    in brackets the file will be considered old rtf
    """
-    def __init__(self, in_file, bug_handler, run_level ):
+    def __init__(self, in_file,
+                bug_handler,
+                run_level,
+                ):
        """
        Required:
            'file'--file to parse
@ -32,11 +36,7 @@ class OldRtf:
            """
        self.__file = in_file
        self.__bug_handler = bug_handler
-        self.__initiate_values()
-        self.__ob_group = 0
-    def __initiate_values(self):
-        self.__previous_token = ''
-        self.__new_found = 0
+        self.__run_level = run_level
        self.__allowable = [
            'annotation' ,
            'blue______'  ,
@ -64,14 +64,18 @@ class OldRtf:
            'superscrip' ,
            'underlined' ,
        ]
-        self.__state = 'before_body'
        self.__action_dict = {
            'before_body'   : self.__before_body_func,
            'in_body'       : self.__check_tokens_func,
            'after_pard'    : self.__after_pard_func,
        }
-        self.__is_old = 0
+
+    def __initiate_values(self):
+        self.__previous_token = ''
+        self.__state = 'before_body'
        self.__found_new = 0
+        self.__ob_group = 0
+
    def __check_tokens_func(self, line):
        if self.__inline_info in self.__allowable:
            if self.__ob_group == self.__base_ob_count:
@ -80,32 +84,32 @@ class OldRtf:
                self.__found_new += 1
        elif self.__token_info ==  'cw<pf<par-def___':
            self.__state = 'after_pard'
+
    def __before_body_func(self, line):
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'in_body'
            self.__base_ob_count = self.__ob_group
+
    def __after_pard_func(self, line):
        if line[0:2] != 'cw':
            self.__state = 'in_body'
+
    def check_if_old_rtf(self):
        """
        Requires:
            nothing
        Returns:
-            1 if file is older RTf
-            0 if file is newer RTF
+            True if file is older RTF
+            False if file is newer RTF
        """
-
-        read_obj = open(self.__file, 'r')
-        line = 1
+        self.__initiate_values()
        line_num = 0
-        while line:
-            line = read_obj.readline()
+        with open(self.__file, 'r') as read_obj:
+            for line in read_obj:
                line_num += 1
                self.__token_info = line[:16]
                if self.__token_info == 'mi<mk<body-close':
-                return 0
-                self.__ob_group = 0
+                    return False
                if self.__token_info == 'ob<nu<open-brack':
                    self.__ob_group += 1
                    self.__ob_count = line[-5:-1]
@ -114,14 +118,22 @@ class OldRtf:
                    self.__cb_count = line[-5:-1]
                self.__inline_info = line[6:16]
                if self.__state == 'after_body':
-                return 0
+                    return False
                action = self.__action_dict.get(self.__state)
-            if not action:
-                sys.stderr.write('No action for state!\n')
+                if action is None:
+                    try:
+                        sys.stderr.write('No action for this state!\n')
+                    except:
+                        pass
                result = action(line)
                if result == 'new_rtf':
-                return 0
+                    return False
                elif result == 'old_rtf':
-                return 1
+                    if self.__run_level > 3:
+                        sys.stderr.write(
+                            'Old rtf construction %s (bracket %s, line %s)\n' 
+                                % (self.__inline_info, str(self.__ob_group), line_num)
+                        )
+                    return True
                self.__previous_token = line[6:16]
-        return 0
+        return False
--- a/src/calibre/ebooks/rtf2xml/paragraphs.py
+++ b/src/calibre/ebooks/rtf2xml/paragraphs.py
@ -11,6 +11,7 @@
 #                                                                       #
 #########################################################################
 import sys, os
+
 from calibre.ebooks.rtf2xml import copy
 from calibre.ptempfile import better_mktemp

@ -34,7 +35,7 @@ a blank paragraph.
    Once a paragraph is found, the state changes to 'paragraph.' In this state,
    clues are looked to for the end of a paragraph. The end of a paragraph marker
    (\par) marks the end of a paragraph. So does the end of a footnote or heading;
-a paragraph definintion; the end of a field-block; and the beginning of a
+    a paragraph definition; the end of a field-block; and the beginning of a
    section. (How about the end of a section or the end of a field-block?)
    """
    def __init__(self,
@ -60,6 +61,7 @@ section. (How about the end of a section or the end of a field-block?)
        self.__write_empty_para = write_empty_para
        self.__run_level = run_level
        self.__write_to = better_mktemp()
+
    def __initiate_values(self):
        """
        Initiate all values.
@ -99,6 +101,7 @@ section. (How about the end of a section or the end of a field-block?)
        'mi<mk<pict-start'      : self.__start_para_func,
        'cw<pf<page-break'      : self.__empty_pgbk_func,    # page break
        }
+
    def __before_body_func(self, line):
        """
        Required:
@ -112,6 +115,7 @@ section. (How about the end of a section or the end of a field-block?)
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'not_paragraph'
        self.__write_obj.write(line)
+
    def __not_paragraph_func(self, line):
        """
        Required:
@ -127,6 +131,7 @@ section. (How about the end of a section or the end of a field-block?)
        if action:
            action(line)
        self.__write_obj.write(line)
+
    def __paragraph_func(self, line):
        """
        Required:
@ -144,6 +149,7 @@ section. (How about the end of a section or the end of a field-block?)
            action(line)
        else:
            self.__write_obj.write(line)
+
    def __start_para_func(self, line):
        """
        Requires:
@ -160,6 +166,7 @@ section. (How about the end of a section or the end of a field-block?)
        )
        self.__write_obj.write(self.__start2_marker)
        self.__state = 'paragraph'
+
    def __empty_para_func(self, line):
        """
        Requires:
@ -176,6 +183,7 @@ section. (How about the end of a section or the end of a field-block?)
            'mi<tg<empty_____<para\n'
            )
            self.__write_obj.write(self.__end_marker)   # marker for later parsing
+
    def __empty_pgbk_func(self, line):
        """
        Requires:
@ -188,6 +196,7 @@ section. (How about the end of a section or the end of a field-block?)
        self.__write_obj.write(
        'mi<tg<empty_____<page-break\n'
        )
+
    def __close_para_func(self, line):
        """
        Requires:
@ -205,6 +214,7 @@ section. (How about the end of a section or the end of a field-block?)
        self.__write_obj.write(self.__end_marker) # marker for later parser
        self.__write_obj.write(line)
        self.__state = 'not_paragraph'
+
    def __bogus_para__def_func(self, line):
        """
        Requires:
@ -215,6 +225,7 @@ section. (How about the end of a section or the end of a field-block?)
            if a \pard occurs in a paragraph, I want to ignore it. (I believe)
        """
        self.__write_obj.write('mi<mk<bogus-pard\n')
+
    def make_paragraphs(self):
        """
        Requires:
@ -229,20 +240,18 @@ section. (How about the end of a section or the end of a field-block?)
            only other state is 'paragraph'.
        """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('no no matching state in module sections.py\n')
+                    if action is None:
+                        try:
+                            sys.stderr.write('no matching state in module paragraphs.py\n')
                            sys.stderr.write(self.__state + '\n')
+                        except:
+                            pass
                    action(line)
-        read_obj.close()
-        self.__write_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "paragraphs.data")
--- a/src/calibre/ebooks/rtf2xml/preamble_rest.py
+++ b/src/calibre/ebooks/rtf2xml/preamble_rest.py
@ -11,7 +11,9 @@
 #                                                                       #
 #########################################################################
 import sys,os
+
 from calibre.ebooks.rtf2xml import copy
+
 class Preamble:
    """
    Fix the reamaing parts of the preamble. This module does very little. It
@ -19,8 +21,14 @@ class Preamble:
    future, when I understand how to interpret the revision table and list
    table, I will make these methods more functional.
    """
-    def __init__(self, file, bug_handler,  platform, default_font, code_page,
-    copy=None, temp_dir=None):
+    def __init__(self, file,
+                bug_handler,
+                platform,
+                default_font,
+                code_page,
+                copy=None,
+                temp_dir=None,
+                ):
        """
        Required:
            file--file to parse
@ -44,6 +52,7 @@ class Preamble:
            self.__write_to = os.path.join(temp_dir,"info_table_info.data")
        else:
            self.__write_to = "info_table_info.data"
+
    def __initiate_values(self):
        """
        Initiate all values.
@ -62,12 +71,14 @@ class Preamble:
        'mi<mk<revtbl-beg'      : self.__found_revision_table_func,
        'mi<mk<body-open_'      : self.__found_body_func,
        }
+
    def __default_func(self, line):
        action = self.__default_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)
+
    def __found_rtf_head_func(self, line):
        """
        Requires:
@ -84,8 +95,10 @@ class Preamble:
            '<platform>%s\n' % (self.__default_font, self.__code_page,
            self.__platform)
        )
+
    def __found_list_table_func(self, line):
        self.__state = 'list_table'
+
    def __list_table_func(self, line):
        if self.__token_info == 'mi<mk<listabend_':
            self.__state = 'default'
@ -93,8 +106,10 @@ class Preamble:
            pass
        else:
            self.__write_obj.write(line)
+
    def __found_revision_table_func(self, line):
        self.__state = 'revision'
+
    def __revision_table_func(self, line):
        if self.__token_info == 'mi<mk<revtbl-end':
            self.__state = 'default'
@ -102,11 +117,14 @@ class Preamble:
            pass
        else:
            self.__write_obj.write(line)
+
    def __found_body_func(self, line):
        self.__state = 'body'
        self.__write_obj.write(line)
+
    def __body_func(self, line):
        self.__write_obj.write(line)
+
    def fix_preamble(self):
        """
        Requires:
@ -119,20 +137,15 @@ class Preamble:
            the list table.
        """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('no no matching state in module preamble_rest.py\n')
-                sys.stderr.write(self.__state + '\n')
+                    if action is None:
+                        sys.stderr.write(
+                        'no matching state in module preamble_rest.py\n' + self.__state + '\n')
                    action(line)
-        read_obj.close()
-        self.__write_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "preamble_div.data")
--- a/src/calibre/ebooks/rtf2xml/sections.py
+++ b/src/calibre/ebooks/rtf2xml/sections.py
@ -11,6 +11,7 @@
 #                                                                       #
 #########################################################################
 import sys, os
+
 from calibre.ebooks.rtf2xml import copy
 from calibre.ptempfile import better_mktemp

@ -33,7 +34,7 @@ tokens and write the section tags.
    The exception to this method occurs when sections occur in field blocks, such
    as the index. Normally, two section break occur within the index and other
    field-blocks. (If less or more section breaks occurr, this code may not work.)
-I want the sections to occurr outside of the index. That is, the index
+    I want the sections to occur outside of the index. That is, the index
    should be nested inside one section tag. After the index is complete, a new
    section should begin.
    In order to write the sections outside of the field blocks, I have to store