RTF Input: Handle old RTF files that have commands without braces. Fixes #994133 (Private bug)

2025-07-09 03:04:10 -04:00 · 2012-05-13 07:55:09 +05:30 · 2012-05-13 07:55:09 +05:30 · fb94b02be3
commit fb94b02be3
parent e638c9f3f3 6acc92af67
11 changed files with 14057 additions and 381 deletions
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@ -372,8 +372,8 @@ class ParseRtf:
        old_rtf = old_rtf_obj.check_if_old_rtf()
        if old_rtf:
            if self.__run_level > 5:
-                msg = 'Older RTF\n'
-                msg += 'self.__run_level is "%s"\n' % self.__run_level
+                msg = 'Older RTF\n' \
+                'self.__run_level is "%s"\n' % self.__run_level
                raise RtfInvalidCodeException, msg
            if self.__run_level > 1:
                sys.stderr.write('File could be older RTF...\n')
@ -381,7 +381,7 @@ class ParseRtf:
                if self.__run_level > 1:
                    sys.stderr.write(
                        'File also has newer RTF.\n'
-                        'Will do the best to convert.\n'
+                        'Will do the best to convert...\n'
                    )
            add_brackets_obj = add_brackets.AddBrackets(
                    in_file = self.__temp_file,
--- a/src/calibre/ebooks/rtf2xml/add_brackets.py
+++ b/src/calibre/ebooks/rtf2xml/add_brackets.py
@ -20,6 +20,9 @@ class AddBrackets:
    """
    Add brackets for old RTF.
    Logic:
+    When control words without their own brackets are encountered
+    and in the list of allowed words, this will add brackets
+    to facilitate the treatment of the file
    """
    def __init__(self, in_file,
            bug_handler,
@ -41,53 +44,56 @@ class AddBrackets:
        self.__copy = copy
        self.__write_to = better_mktemp()
        self.__run_level = run_level
-
-    def __initiate_values(self):
-        """
-        """
        self.__state_dict = {
            'before_body'           : self.__before_body_func,
            'in_body'               : self.__in_body_func,
            'after_control_word'    : self.__after_control_word_func,
            'in_ignore'             : self.__ignore_func,
        }
+        self.__accept = [
+            'cw<ci<bold______' ,
+            'cw<ci<annotation' ,
+            'cw<ci<blue______' ,
+            # 'cw<ci<bold______' ,
+            'cw<ci<caps______' ,
+            'cw<ci<char-style' ,
+            'cw<ci<dbl-strike' ,
+            'cw<ci<emboss____' ,
+            'cw<ci<engrave___' ,
+            'cw<ci<font-color' ,
+            'cw<ci<font-down_' ,
+            'cw<ci<font-size_' ,
+            'cw<ci<font-style' ,
+            'cw<ci<font-up___' ,
+            'cw<ci<footnot-mk' ,
+            'cw<ci<green_____' ,
+            'cw<ci<hidden____' ,
+            'cw<ci<italics___' ,
+            'cw<ci<outline___' ,
+            'cw<ci<red_______' ,
+            'cw<ci<shadow____' ,
+            'cw<ci<small-caps' ,
+            'cw<ci<strike-thr' ,
+            'cw<ci<subscript_' ,
+            'cw<ci<superscrip' ,
+            'cw<ci<underlined' ,
+            # 'cw<ul<underlined' ,
+        ]
+
+    def __initiate_values(self):
+        """
+        Init temp values
+        """
        self.__state = 'before_body'
        self.__inline = {}
        self.__temp_group = []
-        self.__open_bracket = 0
-        self.__found_brackets = 0
-        self.__accept = [
-        'cw<ci<bold______',
-        'cw<ci<annotation'  ,
-        'cw<ci<blue______' ,
-        'cw<ci<bold______' ,
-        'cw<ci<caps______' ,
-        'cw<ci<char-style' ,
-        'cw<ci<dbl-strike' ,
-        'cw<ci<emboss____'  ,
-        'cw<ci<engrave___' ,
-        'cw<ci<font-color' ,
-        'cw<ci<font-down_' ,
-        'cw<ci<font-size_' ,
-        'cw<ci<font-style' ,
-        'cw<ci<font-up___',
-        'cw<ci<footnot-mk',
-        'cw<ci<green_____' ,
-        'cw<ci<hidden____',
-        'cw<ci<italics___' ,
-        'cw<ci<outline___',
-        'cw<ci<red_______' ,
-        'cw<ci<shadow____',
-        'cw<ci<small-caps' ,
-        'cw<ci<strike-thr',
-        'cw<ci<subscript_' ,
-        'cw<ci<superscrip',
-        'cw<ci<underlined' ,
-        # 'cw<ul<underlined' ,
-        ]
+        self.__open_bracket = False
+        self.__found_brackets = False
+        

    def __before_body_func(self, line):
        """
+        If we are before the body, we are not interested in changing anything
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'in_body'
@ -95,6 +101,14 @@ class AddBrackets:

    def __in_body_func(self, line):
        """
+        Select what action to take in body:
+            1-At the end of the file, close the bracket if a bracket was opened
+            This happens if there is a change
+            2-If an open bracket is found, the code inside is ignored
+            (written without modifications)
+            3-If an accepted control word is found, put the line
+            in a buffer, then change state to after cw
+            4-Else simply write the line
        """
        if line == 'cb<nu<clos-brack<0001\n' and self.__open_bracket:
            self.__write_obj.write(
@ -102,7 +116,7 @@ class AddBrackets:
                    )
            self.__write_obj.write(line)
        elif self.__token_info == 'ob<nu<open-brack':
-            self.__found_brackets = 1
+            self.__found_brackets = True
            self.__state = 'in_ignore'
            self.__ignore_count = self.__ob_count
            self.__write_obj.write(line)
@ -114,6 +128,10 @@ class AddBrackets:

    def __after_control_word_func(self, line):
        """
+        After a cw, either add the next allowed cw to the temporary list or
+        change group and write it.
+        If the token leading to an exit is an open bracket, go to
+        ignore; otherwise go to in body
        """
        if self.__token_info in self.__accept:
            self.__temp_group.append(line)
@ -129,82 +147,84 @@ class AddBrackets:

    def __write_group(self):
        """
+        Write a temporary group after accepted control words end
+        But this is mostly useless in my opinion as there is no list of rejected cw
+        This may be a way to implement future old rtf processing for cw
+        Utility: open a group to just put brackets but why be so complicated?
+        Scheme: open brackets, write cw then go to body and back with cw after 
        """
        if self.__open_bracket:
            self.__write_obj.write(
                'cb<nu<clos-brack<0003\n'
                )
-            self.__open_bracket = 0
-        inline_string = ''
-        the_keys = self.__inline.keys()
-        for the_key in the_keys:
-            value = self.__inline[the_key]
-            if value != 'false':
-                inline_string += '%s<nu<%s\n' % (the_key, value)
+            self.__open_bracket = False
+
+        inline_string = ''.join(['%s<nu<%s\n' % (k, v) \
+                for k, v in self.__inline.iteritems() \
+                    if v != 'false'])
        if inline_string:
-            self.__write_obj.write('ob<nu<open-brack<0003\n')
-            self.__write_obj.write(inline_string)
-            self.__open_bracket = 1
+            self.__write_obj.write('ob<nu<open-brack<0003\n'
+                '%s' % inline_string)
+            self.__open_bracket = True
        self.__temp_group = []

    def __change_permanent_group(self):
        """
-        use temp group to change permanent group
+        Use temp group to change permanent group
+        If the control word is not accepted remove it
+        What is the point of this, as it is built to accept only accepted cw
+        in __after_control_word_func?
        """
-        for line in self.__temp_group:
-            token_info = line[:16]
-            if token_info in self.__accept:
-                att = line[20:-1]
-                self.__inline[token_info] = att
+        self.__inline = {line[:16] : line[20:-1]\
+            for line in self.__temp_group\
+            # Is this really necessary?
+                if line[:16] in self.__accept}
+

    def __ignore_func(self, line):
        """
-        Don't add any brackets while inside of brackets RTF has already
-        added.
+        Just copy data inside of RTF brackets already here.
        """
        self.__write_obj.write(line)
-        if self.__token_info == 'cb<nu<clos-brack'and\
-            self.__cb_count == self.__ignore_count:
+        if self.__token_info == 'cb<nu<clos-brack'\
+            and self.__cb_count == self.__ignore_count:
            self.__state = 'in_body'

    def __check_brackets(self, in_file):
-        self.__check_brack_obj = check_brackets.CheckBrackets\
+        """
+        Return True if brackets match
+        """
+        check_brack_obj = check_brackets.CheckBrackets\
            (file = in_file)
-        good_br =  self.__check_brack_obj.check_brackets()[0]
-        if not good_br:
-            return 1
+        return check_brack_obj.check_brackets()[0]

    def add_brackets(self):
        """
        """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            if self.__token_info == 'ob<nu<open-brack':
-                self.__ob_count = line[-5:-1]
-            if self.__token_info == 'cb<nu<clos-brack':
-                self.__cb_count = line[-5:-1]
-            action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('No matching state in module add_brackets.py\n')
-                sys.stderr.write(self.__state + '\n')
-            action(line)
-        read_obj.close()
-        self.__write_obj.close()
-        bad_brackets = self.__check_brackets(self.__write_to)
-        if not bad_brackets:
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    if self.__token_info == 'ob<nu<open-brack':
+                        self.__ob_count = line[-5:-1]
+                    if self.__token_info == 'cb<nu<clos-brack':
+                        self.__cb_count = line[-5:-1]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write(
+                            'No matching state in module add_brackets.py\n'
+                            '%s\n' % self.__state)
+                    action(line)
+        #Check bad brackets
+        if self.__check_brackets(self.__write_to):
            copy_obj = copy.Copy(bug_handler = self.__bug_handler)
            if self.__copy:
                copy_obj.copy_file(self.__write_to, "add_brackets.data")
-            copy_obj.rename(self.__write_to, self.__file)
+            copy_obj.rename(self.__write_to, self.__file)  
        else:
            if self.__run_level > 0:
                sys.stderr.write(
                    'Sorry, but this files has a mix of old and new RTF.\n'
                    'Some characteristics cannot be converted.\n')
-        os.remove(self.__write_to)
+        os.remove(self.__write_to)
--- a/src/calibre/ebooks/rtf2xml/char_set.py
+++ b/src/calibre/ebooks/rtf2xml/char_set.py
--- a/src/calibre/ebooks/rtf2xml/convert_to_tags.py
+++ b/src/calibre/ebooks/rtf2xml/convert_to_tags.py
@ -1,4 +1,5 @@
 import os, sys
+from codecs import EncodedFile

 from calibre.ebooks.rtf2xml import copy, check_encoding
 from calibre.ptempfile import better_mktemp
@ -41,6 +42,7 @@ class ConvertToTags:
        self.__run_level = run_level
        self.__write_to = better_mktemp()
        self.__convert_utf = False
+        self.__bad_encoding = False

    def __initiate_values(self):
        """
@ -213,13 +215,14 @@ class ConvertToTags:

        if not check_encoding_obj.check_encoding(self.__file, verbose=False):
            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
-        elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
+        elif not check_encoding_obj.check_encoding(self.__file, self.__encoding, verbose=False):
            self.__write_obj.write('<?xml version="1.0" encoding="UTF-8" ?>')
            self.__convert_utf = True
        else:
            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
            sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
                    ' hope for the best')
+            self.__bad_encoding = True
        self.__new_line = 0
        self.__write_new_line()
        if self.__no_dtd:
@ -247,7 +250,7 @@ class ConvertToTags:
        the appropriate function.
        The functions that are called:
            a text function for text
-            an open funciton for open tags
+            an open function for open tags
            an open with attribute function for tags with attributes
            an empty with attribute function for tags that are empty but have
            attribtes.
@ -263,20 +266,19 @@ class ConvertToTags:
                    action = self.__state_dict.get(self.__token_info)
                    if action is not None:
                        action(line)
-        self.__write_obj.close()
-        #convert all encodings to UTF8 to avoid unsupported encodings in lxml
-        if self.__convert_utf:
+        #convert all encodings to UTF8 or ASCII to avoid unsupported encodings in lxml
+        if self.__convert_utf or self.__bad_encoding:
            copy_obj = copy.Copy(bug_handler = self.__bug_handler)
            copy_obj.rename(self.__write_to, self.__file)
+            file_encoding = "utf-8"
+            if self.__bad_encoding:
+                file_encoding = "us-ascii"
            with open(self.__file, 'r') as read_obj:
                with open(self.__write_to, 'w') as write_obj:
-                    file = read_obj.read()
-                    try:
-                        file = file.decode(self.__encoding)
-                        write_obj.write(file.encode('utf-8'))
-                    except:
-                        sys.stderr.write('Conversion to UTF-8 is not possible,'
-                        ' encoding should be very carefully checked')
+                    write_objenc = EncodedFile(write_obj, self.__encoding,
+                                    file_encoding, 'replace')
+                    for line in read_obj:
+                        write_objenc.write(line)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "convert_to_tags.data")
--- a/src/calibre/ebooks/rtf2xml/header.py
+++ b/src/calibre/ebooks/rtf2xml/header.py
@ -11,6 +11,7 @@
 #                                                                       #
 #########################################################################
 import sys, os
+
 from calibre.ebooks.rtf2xml import copy
 from calibre.ptempfile import better_mktemp

@ -31,29 +32,29 @@ class Header:
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
-        self.__found_a_header = 0
+        self.__found_a_header = False
+
    def __in_header_func(self, line):
        """
        Handle all tokens that are part of header
        """
        if self.__cb_count == self.__header_bracket_count:
-            self.__in_header = 0
+            self.__in_header = False
            self.__write_obj.write(line)
            self.__write_to_head_obj.write(
-            'mi<mk<head___clo\n')
-            self.__write_to_head_obj.write(
-            'mi<tg<close_____<header-or-footer\n')
-            self.__write_to_head_obj.write(
+            'mi<mk<head___clo\n' \
+            'mi<tg<close_____<header-or-footer\n' \
            'mi<mk<header-clo\n')
        else:
            self.__write_to_head_obj.write(line)
+
    def __found_header(self, line):
        """
        Found a header
        """
        # but this could be header or footer
-        self.__found_a_header = 1
-        self.__in_header = 1
+        self.__found_a_header = True
+        self.__in_header = True
        self.__header_count += 1
        # temporarily set this to zero so I can enter loop
        self.__cb_count = 0
@ -69,18 +70,23 @@ class Header:
                    'mi<tg<open-att__<header-or-footer<type>%s\n' % (type)
                    )
        else:
-            sys.stderr.write('module is header\n')
-            sys.stderr.write('method is __found_header\n')
-            sys.stderr.write('no dict entry\n')
-            sys.stderr.write('line is %s' % line)
+            sys.stderr.write(
+            'module is header\n' \
+            'method is __found_header\n' \
+            'no dict entry\n' \
+            'line is %s' % line)
            self.__write_to_head_obj.write(
                    'mi<tg<open-att__<header-or-footer<type>none\n'
                    )
+
    def __default_sep(self, line):
-        """Handle all tokens that are not header tokens"""
+        """
+        Handle all tokens that are not header tokens
+        """
        if self.__token_info[3:5] == 'hf':
            self.__found_header(line)
        self.__write_obj.write(line)
+
    def __initiate_sep_values(self):
        """
        initiate counters for separate_footnotes method.
@ -89,7 +95,7 @@ class Header:
        self.__ob_count = 0
        self.__cb_count = 0
        self.__header_bracket_count = 0
-        self.__in_header = 0
+        self.__in_header = False
        self.__header_count = 0
        self.__head_dict = {
            'head-left_'        :   ('header-left'),
@ -101,6 +107,7 @@ class Header:
            'header____'        :   ('header' ),
            'footer____'        :   ('footer' ),
        }
+
    def separate_headers(self):
        """
        Separate all the footnotes in an RTF file and put them at the bottom,
@ -110,53 +117,47 @@ class Header:
        bottom of the main file.
        """
        self.__initiate_sep_values()
-        read_obj = open(self.__file)
-        self.__write_obj = open(self.__write_to, 'w')
        self.__header_holder = better_mktemp()
-        self.__write_to_head_obj = open(self.__header_holder, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            # keep track of opening and closing brackets
-            if self.__token_info == 'ob<nu<open-brack':
-                self.__ob_count = line[-5:-1]
-            if self.__token_info == 'cb<nu<clos-brack':
-                self.__cb_count = line[-5:-1]
-            # In the middle of footnote text
-            if self.__in_header:
-                self.__in_header_func(line)
-            # not in the middle of footnote text
-            else:
-                self.__default_sep(line)
-        self.__write_obj.close()
-        read_obj.close()
-        self.__write_to_head_obj.close()
-        read_obj = open(self.__header_holder, 'r')
-        write_obj = open(self.__write_to, 'a')
-        write_obj.write(
-        'mi<mk<header-beg\n')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            write_obj.write(line)
-        write_obj.write(
-        'mi<mk<header-end\n')
-        read_obj.close()
-        write_obj.close()
+        with open(self.__file) as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                with open(self.__header_holder, 'w') as self.__write_to_head_obj:
+                    for line in read_obj:
+                        self.__token_info = line[:16]
+                        # keep track of opening and closing brackets
+                        if self.__token_info == 'ob<nu<open-brack':
+                            self.__ob_count = line[-5:-1]
+                        if self.__token_info == 'cb<nu<clos-brack':
+                            self.__cb_count = line[-5:-1]
+                        # In the middle of footnote text
+                        if self.__in_header:
+                            self.__in_header_func(line)
+                        # not in the middle of footnote text
+                        else:
+                            self.__default_sep(line)
+        
+        with open(self.__header_holder, 'r') as read_obj:
+            with open(self.__write_to, 'a') as write_obj:
+                write_obj.write(
+                'mi<mk<header-beg\n')
+                for line in read_obj:
+                    write_obj.write(line)
+                write_obj.write(
+                'mi<mk<header-end\n')
        os.remove(self.__header_holder)
+
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
-            copy_obj.copy_file(self.__write_to, "header_separate.info")
+            copy_obj.copy_file(self.__write_to, "header_separate.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
+
    def update_info(self, file, copy):
        """
        Unused method
        """
        self.__file = file
        self.__copy = copy
+
    def __get_head_body_func(self, line):
        """
        Process lines in main body and look for beginning of headers.
@ -166,6 +167,7 @@ class Header:
            self.__state = 'head'
        else:
            self.__write_obj.write(line)
+
    def __get_head_head_func(self, line):
        """
        Copy headers and footers from bottom of file to a separate, temporary file.
@ -174,6 +176,7 @@ class Header:
            self.__state = 'body'
        else:
            self.__write_to_head_obj.write(line)
+
    def __get_headers(self):
        """
        Private method to remove footnotes from main file.  Read one line from
@ -182,21 +185,16 @@ class Header:
        These two functions do the work of separating the footnotes form the
        body.
        """
-        read_obj = open(self.__file)
-        self.__write_obj = open(self.__write_to, 'w')
-            # self.__write_to = "footnote_info.data"
-        self.__write_to_head_obj = open(self.__header_holder, 'w')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            self.__token_info = line[:16]
-            if self.__state == 'body':
-                self.__get_head_body_func(line)
-            elif self.__state == 'head':
-                self.__get_head_head_func(line)
-        read_obj.close()
-        self.__write_obj.close()
-        self.__write_to_head_obj.close()
+        with open(self.__file) as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                with open(self.__header_holder, 'w') as self.__write_to_head_obj:
+                    for line in read_obj:
+                        self.__token_info = line[:16]
+                        if self.__state == 'body':
+                            self.__get_head_body_func(line)
+                        elif self.__state == 'head':
+                            self.__get_head_head_func(line)
+
    def __get_head_from_temp(self, num):
        """
        Private method for joining headers and footers to body. This method
@ -205,18 +203,17 @@ class Header:
        returns them as a string.
        """
        look_for = 'mi<mk<header-ope<' + num + '\n'
-        found_head = 0
+        found_head = False
        string_to_return = ''
-        line = 1
-        while line:
-            line = self.__read_from_head_obj.readline()
+        for line in self.__read_from_head_obj:
            if found_head:
                if line == 'mi<mk<header-clo\n':
                    return string_to_return
-                string_to_return = string_to_return + line
+                string_to_return += line
            else:
                if line == look_for:
-                    found_head = 1
+                    found_head = True
+
    def __join_from_temp(self):
        """
        Private method for rejoining footnotes to body.  Read from the
@ -227,15 +224,13 @@ class Header:
        If no footnote marker is found, simply print out the token (line).
        """
        self.__read_from_head_obj = open(self.__header_holder, 'r')
-        read_obj = open(self.__write_to, 'r')
        self.__write_obj = open(self.__write_to2, 'w')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            if line[:16] == 'mi<mk<header-ind':
-                line = self.__get_head_from_temp(line[17:-1])
-            self.__write_obj.write(line)
-        read_obj.close()
+        with open(self.__write_to, 'r') as read_obj:
+            for line in read_obj:
+                if line[:16] == 'mi<mk<header-ind':
+                    line = self.__get_head_from_temp(line[17:-1])
+                self.__write_obj.write(line)
+
    def join_headers(self):
        """
        Join the footnotes from the bottom of the file and put them in their
--- a/src/calibre/ebooks/rtf2xml/hex_2_utf8.py
+++ b/src/calibre/ebooks/rtf2xml/hex_2_utf8.py
@ -181,7 +181,7 @@ class Hex2Utf8:
            self.__dingbats_dict.update(dingbats_base_dict)
            self.__dingbats_dict.update(ms_dingbats_dict)
        # load dictionary for caps, and make a string for the replacement
-        self.__caps_uni_dict = char_map_obj.get_char_map(map='caps_uni')
+        self.__caps_uni_dict = char_map_obj.get_char_map(map = 'caps_uni')
        # # print self.__caps_uni_dict
        # don't think I'll need this
        ##keys = self.__caps_uni_dict.keys()
--- a/src/calibre/ebooks/rtf2xml/old_rtf.py
+++ b/src/calibre/ebooks/rtf2xml/old_rtf.py
@ -11,14 +11,18 @@
 #                                                                       #
 #########################################################################
 import sys
-"""
-"""
+
 class OldRtf:
    """
    Check to see if the RTF is an older version
    Logic:
+    If allowable control word/properties happen in text without being enclosed
+    in brackets the file will be considered old rtf
    """
-    def __init__(self, in_file, bug_handler, run_level ):
+    def __init__(self, in_file,
+                bug_handler,
+                run_level,
+                ):
        """
        Required:
            'file'--file to parse
@ -32,46 +36,46 @@ class OldRtf:
            """
        self.__file = in_file
        self.__bug_handler = bug_handler
-        self.__initiate_values()
-        self.__ob_group = 0
-    def __initiate_values(self):
-        self.__previous_token = ''
-        self.__new_found = 0
+        self.__run_level = run_level
        self.__allowable = [
-        'annotation' ,
-        'blue______'  ,
-        'bold______',
-        'caps______',
-        'char-style' ,
-        'dbl-strike' ,
-        'emboss____',
-        'engrave___' ,
-        'font-color',
-        'font-down_' ,
-        'font-size_',
-        'font-style',
-        'font-up___',
-        'footnot-mk' ,
-        'green_____' ,
-        'hidden____',
-        'italics___',
-        'outline___',
-        'red_______',
-        'shadow____' ,
-        'small-caps',
-        'strike-thr',
-        'subscript_',
-        'superscrip' ,
-        'underlined' ,
+            'annotation' ,
+            'blue______'  ,
+            'bold______',
+            'caps______',
+            'char-style' ,
+            'dbl-strike' ,
+            'emboss____',
+            'engrave___' ,
+            'font-color',
+            'font-down_' ,
+            'font-size_',
+            'font-style',
+            'font-up___',
+            'footnot-mk' ,
+            'green_____' ,
+            'hidden____',
+            'italics___',
+            'outline___',
+            'red_______',
+            'shadow____' ,
+            'small-caps',
+            'strike-thr',
+            'subscript_',
+            'superscrip' ,
+            'underlined' ,
        ]
-        self.__state = 'before_body'
        self.__action_dict = {
            'before_body'   : self.__before_body_func,
            'in_body'       : self.__check_tokens_func,
            'after_pard'    : self.__after_pard_func,
        }
-        self.__is_old = 0
+
+    def __initiate_values(self):
+        self.__previous_token = ''
+        self.__state = 'before_body'
        self.__found_new = 0
+        self.__ob_group = 0
+
    def __check_tokens_func(self, line):
        if self.__inline_info in self.__allowable:
            if self.__ob_group == self.__base_ob_count:
@ -80,48 +84,56 @@ class OldRtf:
                self.__found_new += 1
        elif self.__token_info ==  'cw<pf<par-def___':
            self.__state = 'after_pard'
+
    def __before_body_func(self, line):
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'in_body'
            self.__base_ob_count = self.__ob_group
+
    def __after_pard_func(self, line):
        if line[0:2] != 'cw':
            self.__state = 'in_body'
+
    def check_if_old_rtf(self):
        """
        Requires:
            nothing
        Returns:
-            1 if file is older RTf
-            0 if file is newer RTF
+            True if file is older RTF
+            False if file is newer RTF
        """
-
-        read_obj = open(self.__file, 'r')
-        line = 1
+        self.__initiate_values()
        line_num = 0
-        while line:
-            line = read_obj.readline()
-            line_num += 1
-            self.__token_info = line[:16]
-            if self.__token_info == 'mi<mk<body-close':
-                return 0
-                self.__ob_group = 0
-            if self.__token_info == 'ob<nu<open-brack':
-                self.__ob_group += 1
-                self.__ob_count = line[-5:-1]
-            if self.__token_info == 'cb<nu<clos-brack':
-                self.__ob_group -= 1
-                self.__cb_count = line[-5:-1]
-            self.__inline_info = line[6:16]
-            if self.__state == 'after_body':
-                return 0
-            action = self.__action_dict.get(self.__state)
-            if not action:
-                sys.stderr.write('No action for state!\n')
-            result = action(line)
-            if result == 'new_rtf':
-                return 0
-            elif result == 'old_rtf':
-                return 1
-            self.__previous_token = line[6:16]
-        return 0
+        with open(self.__file, 'r') as read_obj:
+            for line in read_obj:
+                line_num += 1
+                self.__token_info = line[:16]
+                if self.__token_info == 'mi<mk<body-close':
+                    return False
+                if self.__token_info == 'ob<nu<open-brack':
+                    self.__ob_group += 1
+                    self.__ob_count = line[-5:-1]
+                if self.__token_info == 'cb<nu<clos-brack':
+                    self.__ob_group -= 1
+                    self.__cb_count = line[-5:-1]
+                self.__inline_info = line[6:16]
+                if self.__state == 'after_body':
+                    return False
+                action = self.__action_dict.get(self.__state)
+                if action is None:
+                    try:
+                        sys.stderr.write('No action for this state!\n')
+                    except:
+                        pass
+                result = action(line)
+                if result == 'new_rtf':
+                    return False
+                elif result == 'old_rtf':
+                    if self.__run_level > 3:
+                        sys.stderr.write(
+                            'Old rtf construction %s (bracket %s, line %s)\n' 
+                                % (self.__inline_info, str(self.__ob_group), line_num)
+                        )
+                    return True
+                self.__previous_token = line[6:16]
+        return False
--- a/src/calibre/ebooks/rtf2xml/output.py
+++ b/src/calibre/ebooks/rtf2xml/output.py
@ -10,7 +10,9 @@
 #                                                                       #
 #                                                                       #
 #########################################################################
-import sys, os, codecs
+import sys, os
+# , codecs
+
 class Output:
    """
    Output file
@ -19,7 +21,8 @@ class Output:
            file,
            orig_file,
            output_dir = None,
-            out_file = None
+            out_file = None,
+            no_ask = True
            ):
        """
        Required:
@ -33,8 +36,9 @@ class Output:
        self.__file = file
        self.__orig_file = orig_file
        self.__output_dir = output_dir
-        self.__no_ask = 1
+        self.__no_ask = no_ask
        self.__out_file = out_file
+
    def output(self):
        """
        Required:
@ -45,13 +49,14 @@ class Output:
            output the line to the screen if no output file given. Otherwise, output to
            the file.
        """
-        # self.__output_xml(self.__file, self.__out_file)
        if self.__output_dir:
            self.__output_to_dir_func()
        elif self.__out_file:
-            self.__output_xml(self.__file, self.__out_file)
+            self.__output_to_file_func()
+            # self.__output_xml(self.__file, self.__out_file)
        else:
            self.__output_to_standard_func()
+
    def __output_to_dir_func(self):
        """
        Requires:
@ -64,32 +69,25 @@ class Output:
        """
        base_name = os.path.basename(self.__orig_file)
        base_name, ext  = os.path.splitext(base_name)
-        output_file = '%s.xml' % base_name
-        output_file = os.path.join(self.__output_dir, output_file)
+        output_file = os.path.join(self.__output_dir, '%s.xml' % base_name)
        # change if user wants to output to a specific file
        if self.__out_file:
            output_file = os.path.join(self.__output_dir, self.__out_file)
        user_response = 'o'
-        if os.path.isfile(output_file):
-            if self.__no_ask:
-                user_response = 'o'
-            else:
-                msg = 'Do you want to over-write %s?\n' % output_file
-                msg += 'Type "o" to over-write.\n'
-                msg += 'Type any other key to print to standard output.\n'
-                sys.stderr.write(msg)
-                user_response = raw_input()
+        if os.path.isfile(output_file) and not self.__no_ask:
+            msg = 'Do you want to overwrite %s?\n' % output_file
+            msg += ('Type "o" to overwrite.\n'
+                    'Type any other key to print to standard output.\n')
+            sys.stderr.write(msg)
+            user_response = raw_input()
        if user_response == 'o':
-            read_obj = open(self.__file, 'r')
-            write_obj = open(output_file, 'w')
-            line = 1
-            while line:
-                line = read_obj.readline()
-                write_obj.write(line)
-            read_obj.close()
-            write_obj.close()
+            with open(self.__file, 'r') as read_obj:
+                with open(output_file, 'w') as write_obj:  # local path computed above, not an instance attribute
+                    for line in read_obj:
+                        write_obj.write(line)
        else:
            self.__output_to_standard_func()
+
    def __output_to_file_func(self):
        """
        Required:
@ -99,14 +97,11 @@ class Output:
        Logic:
            read one line at a time. Output to standard
        """
-        read_obj = open(self.__file, 'r')
-        write_obj = open(self.__out_file, 'w')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            write_obj.write(line)
-        read_obj.close()
-        write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__out_file, 'w') as write_obj:
+                for line in read_obj:
+                    write_obj.write(line)
+
    def __output_to_standard_func(self):
        """
        Required:
@ -116,26 +111,24 @@ class Output:
        Logic:
            read one line at a time. Output to standard
        """
-        read_obj = open(self.__file, 'r')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            sys.stdout.write(line)
-        read_obj.close()
-    def __output_xml(self, in_file, out_file):
-        """
-        output the ill-formed xml file
-        """
-        (utf8_encode, utf8_decode, utf8_reader, utf8_writer) = codecs.lookup("utf-8")
-        write_obj = utf8_writer(open(out_file, 'w'))
-        write_obj = open(out_file, 'w')
-        read_obj = utf8_writer(open(in_file, 'r'))
-        read_obj = open(in_file, 'r')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            if isinstance(line, type(u"")):
-                line = line.encode("utf-8")
-            write_obj.write(line)
-        read_obj.close()
-        write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            for line in read_obj:
+                sys.stdout.write(line)
+
+    # def __output_xml(self, in_file, out_file):
+        # """
+        # output the ill-formed xml file
+        # """
+        # (utf8_encode, utf8_decode, utf8_reader, utf8_writer) = codecs.lookup("utf-8")
+        # write_obj = utf8_writer(open(out_file, 'w'))
+        # write_obj = open(out_file, 'w')
+        # read_obj = utf8_writer(open(in_file, 'r'))
+        # read_obj = open(in_file, 'r')
+        # line = 1
+        # while line:
+            # line = read_obj.readline()
+            # if isinstance(line, type(u"")):
+                # line = line.encode("utf-8")
+            # write_obj.write(line)
+        # read_obj.close()
+        # write_obj.close()
--- a/src/calibre/ebooks/rtf2xml/paragraphs.py
+++ b/src/calibre/ebooks/rtf2xml/paragraphs.py
@ -11,31 +11,32 @@
 #                                                                       #
 #########################################################################
 import sys, os
+
 from calibre.ebooks.rtf2xml import copy
 from calibre.ptempfile import better_mktemp

 class Paragraphs:
    """
-=================
-Purpose
-=================
-Write paragraph tags for a tokenized file. (This module won't be any use to use
-to you unless you use it as part of the other modules.)
-------------
-Method
-------------
-RTF does not tell you when a paragraph begins. It only tells you when the
-paragraph ends.
-In order to make paragraphs out of this limited info, the parser starts in the
-body of the documents and assumes it is not in a paragraph. It looks for clues
-to begin a paragraph. Text starts a paragraph; so does an inline field or
-list-text. If an end of paragraph marker (\par) is found, then this indicates
-a blank paragraph.
-Once a paragraph is found, the state changes to 'paragraph.' In this state,
-clues are looked to for the end of a paragraph. The end of a paragraph marker
-(\par) marks the end of a paragraph. So does the end of a footnote or heading;
-a paragraph definintion; the end of a field-block; and the beginning of a
-section. (How about the end of a section or the end of a field-block?)
+    =================
+    Purpose
+    =================
+    Write paragraph tags for a tokenized file. (This module won't be of any use
+    to you unless you use it as part of the other modules.)
+    -------------
+    Method
+    -------------
+    RTF does not tell you when a paragraph begins. It only tells you when the
+    paragraph ends.
+    In order to make paragraphs out of this limited info, the parser starts in the
+    body of the documents and assumes it is not in a paragraph. It looks for clues
+    to begin a paragraph. Text starts a paragraph; so does an inline field or
+    list-text. If an end of paragraph marker (\par) is found, then this indicates
+    a blank paragraph.
+    Once a paragraph is found, the state changes to 'paragraph.' In this state,
+    clues are looked to for the end of a paragraph. The end of a paragraph marker
+    (\par) marks the end of a paragraph. So does the end of a footnote or heading;
+    a paragraph definition; the end of a field-block; and the beginning of a
+    section. (How about the end of a section or the end of a field-block?)
    """
    def __init__(self,
            in_file,
@ -60,6 +61,7 @@ section. (How about the end of a section or the end of a field-block?)
        self.__write_empty_para = write_empty_para
        self.__run_level = run_level
        self.__write_to = better_mktemp()
+
    def __initiate_values(self):
        """
        Initiate all values.
@ -77,7 +79,7 @@ section. (How about the end of a section or the end of a field-block?)
        self.__paragraph_dict = {
        'cw<pf<par-end___'      : self.__close_para_func,   # end of paragraph
        'mi<mk<headi_-end'      : self.__close_para_func,   # end of header or footer
-        ##'cw<pf<par-def___'      : self.__close_para_func,   # paragraph definition
+        ## 'cw<pf<par-def___'      : self.__close_para_func,   # paragraph definition
        # 'mi<mk<fld-bk-end'      : self.__close_para_func,   # end of field-block
        'mi<mk<fldbk-end_'      : self.__close_para_func,   # end of field-block
        'mi<mk<body-close'      : self.__close_para_func,   # end of body
@ -99,6 +101,7 @@ section. (How about the end of a section or the end of a field-block?)
        'mi<mk<pict-start'      : self.__start_para_func,
        'cw<pf<page-break'      : self.__empty_pgbk_func,    # page break
        }
+
    def __before_body_func(self, line):
        """
        Required:
@ -112,6 +115,7 @@ section. (How about the end of a section or the end of a field-block?)
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'not_paragraph'
        self.__write_obj.write(line)
+
    def __not_paragraph_func(self, line):
        """
        Required:
@ -127,6 +131,7 @@ section. (How about the end of a section or the end of a field-block?)
        if action:
            action(line)
        self.__write_obj.write(line)
+
    def __paragraph_func(self, line):
        """
        Required:
@ -144,6 +149,7 @@ section. (How about the end of a section or the end of a field-block?)
            action(line)
        else:
            self.__write_obj.write(line)
+
    def __start_para_func(self, line):
        """
        Requires:
@ -160,6 +166,7 @@ section. (How about the end of a section or the end of a field-block?)
        )
        self.__write_obj.write(self.__start2_marker)
        self.__state = 'paragraph'
+
    def __empty_para_func(self, line):
        """
        Requires:
@ -176,6 +183,7 @@ section. (How about the end of a section or the end of a field-block?)
            'mi<tg<empty_____<para\n'
            )
            self.__write_obj.write(self.__end_marker)   # marker for later parsing
+
    def __empty_pgbk_func(self, line):
        """
        Requires:
@ -188,6 +196,7 @@ section. (How about the end of a section or the end of a field-block?)
        self.__write_obj.write(
        'mi<tg<empty_____<page-break\n'
        )
+
    def __close_para_func(self, line):
        """
        Requires:
@ -205,6 +214,7 @@ section. (How about the end of a section or the end of a field-block?)
        self.__write_obj.write(self.__end_marker) # marker for later parser
        self.__write_obj.write(line)
        self.__state = 'not_paragraph'
+
    def __bogus_para__def_func(self, line):
        """
        Requires:
@ -215,6 +225,7 @@ section. (How about the end of a section or the end of a field-block?)
            if a \pard occurs in a paragraph, I want to ignore it. (I believe)
        """
        self.__write_obj.write('mi<mk<bogus-pard\n')
+
    def make_paragraphs(self):
        """
        Requires:
@ -229,20 +240,18 @@ section. (How about the end of a section or the end of a field-block?)
            only other state is 'paragraph'.
        """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('no no matching state in module sections.py\n')
-                sys.stderr.write(self.__state + '\n')
-            action(line)
-        read_obj.close()
-        self.__write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        try:
+                            sys.stderr.write('no matching state in module paragraphs.py\n')
+                            sys.stderr.write(self.__state + '\n')
+                        except:
+                            pass
+                    action(line)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "paragraphs.data")
--- a/src/calibre/ebooks/rtf2xml/preamble_rest.py
+++ b/src/calibre/ebooks/rtf2xml/preamble_rest.py
@ -11,16 +11,24 @@
 #                                                                       #
 #########################################################################
 import sys,os
+
 from calibre.ebooks.rtf2xml import copy
+
 class Preamble:
    """
    Fix the reamaing parts of the preamble. This module does very little. It
    makes sure that no text gets put in the revision of list table. In the
-    future, when I understand how to interprett he revision table and list
+    future, when I understand how to interpret the revision table and list
    table, I will make these methods more functional.
    """
-    def __init__(self, file, bug_handler,  platform, default_font, code_page,
-    copy=None, temp_dir=None):
+    def __init__(self, file,
+                bug_handler,
+                platform,
+                default_font,
+                code_page,
+                copy=None,
+                temp_dir=None,
+                ):
        """
        Required:
            file--file to parse
@ -44,6 +52,7 @@ class Preamble:
            self.__write_to = os.path.join(temp_dir,"info_table_info.data")
        else:
            self.__write_to = "info_table_info.data"
+
    def __initiate_values(self):
        """
        Initiate all values.
@ -62,12 +71,14 @@ class Preamble:
        'mi<mk<revtbl-beg'      : self.__found_revision_table_func,
        'mi<mk<body-open_'      : self.__found_body_func,
        }
+
    def __default_func(self, line):
        action = self.__default_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)
+
    def __found_rtf_head_func(self, line):
        """
        Requires:
@ -84,8 +95,10 @@ class Preamble:
            '<platform>%s\n' % (self.__default_font, self.__code_page,
            self.__platform)
        )
+
    def __found_list_table_func(self, line):
        self.__state = 'list_table'
+
    def __list_table_func(self, line):
        if self.__token_info == 'mi<mk<listabend_':
            self.__state = 'default'
@ -93,8 +106,10 @@ class Preamble:
            pass
        else:
            self.__write_obj.write(line)
+
    def __found_revision_table_func(self, line):
        self.__state = 'revision'
+
    def __revision_table_func(self, line):
        if self.__token_info == 'mi<mk<revtbl-end':
            self.__state = 'default'
@ -102,11 +117,14 @@ class Preamble:
            pass
        else:
            self.__write_obj.write(line)
+
    def __found_body_func(self, line):
        self.__state = 'body'
        self.__write_obj.write(line)
+
    def __body_func(self, line):
        self.__write_obj.write(line)
+
    def fix_preamble(self):
        """
        Requires:
@ -119,20 +137,15 @@ class Preamble:
            the list table.
        """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('no no matching state in module preamble_rest.py\n')
-                sys.stderr.write(self.__state + '\n')
-            action(line)
-        read_obj.close()
-        self.__write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write(
+                        'no matching state in module preamble_rest.py\n' + self.__state + '\n')
+                    action(line)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "preamble_div.data")
--- a/src/calibre/ebooks/rtf2xml/sections.py
+++ b/src/calibre/ebooks/rtf2xml/sections.py
@ -11,43 +11,44 @@
 #                                                                       #
 #########################################################################
 import sys, os
+
 from calibre.ebooks.rtf2xml import copy
 from calibre.ptempfile import better_mktemp

 class Sections:
    """
-=================
-Purpose
-=================
-Write section tags for a tokenized file. (This module won't be any use to use
-to you unless you use it as part of the other modules.)
---------------
-logic
---------------
-The tags for the first section breaks have already been written.
-RTF stores section breaks with the \sect tag. Each time this tag is
-encountered, add one to the counter.
-When I encounter the \sectd tag, I want to collect all the appropriate tokens
-that describe the section. When I reach a \pard, I know I an stop collecting
-tokens and write the section tags.
-The exception to this method occurs when sections occur in field blocks, such
-as the index. Normally, two section break occur within the index and other
-field-blocks. (If less or more section breaks occurr, this code may not work.)
-I want the sections to occurr outside of the index. That is, the index
-should be nested inside one section tag. After the index is complete, a new
-section should begin.
-In order to write the sections outside of the field blocks, I have to store
-all of the field block as a string. When I ecounter the \sect tag, add one to
-the section counter, but store this number in a list. Likewise, store the
-information describing the section in another list.
-When I reach the end of the field block, choose the first item from the
-numbered list as the section number. Choose the first item in the description
-list as the values and attributes of the section. Enclose the field string
-between the section tags.
-Start a new section outside the field-block strings. Use the second number in
-the list; use the second item in the description list.
-CHANGE (2004-04-26) No longer write sections that occurr in field-blocks.
-Instead, ingore all section information in a field-block.
+    =================
+    Purpose
+    =================
+    Write section tags for a tokenized file. (This module won't be of any use
+    to you unless you use it as part of the other modules.)
+    ---------------
+    logic
+    ---------------
+    The tags for the first section breaks have already been written.
+    RTF stores section breaks with the \sect tag. Each time this tag is
+    encountered, add one to the counter.
+    When I encounter the \sectd tag, I want to collect all the appropriate tokens
+    that describe the section. When I reach a \pard, I know I can stop collecting
+    tokens and write the section tags.
+    The exception to this method occurs when sections occur in field blocks, such
+    as the index. Normally, two section breaks occur within the index and other
+    field-blocks. (If fewer or more section breaks occur, this code may not work.)
+    I want the sections to occur outside of the index. That is, the index
+    should be nested inside one section tag. After the index is complete, a new
+    section should begin.
+    In order to write the sections outside of the field blocks, I have to store
+    all of the field block as a string. When I encounter the \sect tag, add one to
+    the section counter, but store this number in a list. Likewise, store the
+    information describing the section in another list.
+    When I reach the end of the field block, choose the first item from the
+    numbered list as the section number. Choose the first item in the description
+    list as the values and attributes of the section. Enclose the field string
+    between the section tags.
+    Start a new section outside the field-block strings. Use the second number in
+    the list; use the second item in the description list.
+    CHANGE (2004-04-26) No longer write sections that occur in field-blocks.
+    Instead, ignore all section information in a field-block.
    """
    def __init__(self,
            in_file,