RTF Input: Handle old RTF files that have commands without braces. Fixes #994133 (Private bug)

2025-07-09 03:04:10 -04:00 · 2012-05-13 07:55:09 +05:30 · 2012-05-13 07:55:09 +05:30 · fb94b02be3
commit fb94b02be3
parent e638c9f3f3 6acc92af67
11 changed files with 14057 additions and 381 deletions
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@ -372,8 +372,8 @@ class ParseRtf:
        old_rtf = old_rtf_obj.check_if_old_rtf()
        if old_rtf:
            if self.__run_level > 5:
-                msg = 'Older RTF\n'
+                msg = 'Older RTF\n' \
-                msg += 'self.__run_level is "%s"\n' % self.__run_level
+                'self.__run_level is "%s"\n' % self.__run_level
                raise RtfInvalidCodeException, msg
            if self.__run_level > 1:
                sys.stderr.write('File could be older RTF...\n')
@ -381,7 +381,7 @@ class ParseRtf:
                if self.__run_level > 1:
                    sys.stderr.write(
                        'File also has newer RTF.\n'
-                        'Will do the best to convert.\n'
+                        'Will do the best to convert...\n'
                    )
            add_brackets_obj = add_brackets.AddBrackets(
                    in_file = self.__temp_file,
--- a/src/calibre/ebooks/rtf2xml/add_brackets.py
+++ b/src/calibre/ebooks/rtf2xml/add_brackets.py
@ -20,6 +20,9 @@ class AddBrackets:
    """
    Add brackets for old RTF.
    Logic:
    When control words without their own brackets are encountered
    and in the list of allowed words, this will add brackets
    to facilitate the treatment of the file
    """
    def __init__(self, in_file,
            bug_handler,
@ -41,53 +44,56 @@ class AddBrackets:
        self.__copy = copy
        self.__write_to = better_mktemp()
        self.__run_level = run_level
    def __initiate_values(self):
        """
        Build the two lookup tables used while scanning tokens.

        __state_dict maps each state name to the bound method that
        handles a line while the parser is in that state.
        __accept lists the 16-character token prefixes of the control
        words that are treated as accepted.

        NOTE(review): another method named __initiate_values is defined
        right after this one in this listing -- confirm which name this
        method is meant to carry.
        """
        # state name -> handler called with the current line
        self.__state_dict = {
            'before_body'           : self.__before_body_func,
            'in_body'               : self.__in_body_func,
            'after_control_word'    : self.__after_control_word_func,
            'in_ignore'             : self.__ignore_func,
        }
        # token prefixes compared against the first 16 characters of a line
        self.__accept = [
            'cw<ci<bold______' ,
            'cw<ci<annotation' ,
            'cw<ci<blue______' ,
            # duplicate of the first entry, left disabled
            # 'cw<ci<bold______' ,
            'cw<ci<caps______' ,
            'cw<ci<char-style' ,
            'cw<ci<dbl-strike' ,
            'cw<ci<emboss____' ,
            'cw<ci<engrave___' ,
            'cw<ci<font-color' ,
            'cw<ci<font-down_' ,
            'cw<ci<font-size_' ,
            'cw<ci<font-style' ,
            'cw<ci<font-up___' ,
            'cw<ci<footnot-mk' ,
            'cw<ci<green_____' ,
            'cw<ci<hidden____' ,
            'cw<ci<italics___' ,
            'cw<ci<outline___' ,
            'cw<ci<red_______' ,
            'cw<ci<shadow____' ,
            'cw<ci<small-caps' ,
            'cw<ci<strike-thr' ,
            'cw<ci<subscript_' ,
            'cw<ci<superscrip' ,
            'cw<ci<underlined' ,
            # 'cw<ul<underlined' ,
        ]
    def __initiate_values(self):
        """
        Init temp values
        """
        self.__state = 'before_body'
        self.__inline = {}
        self.__temp_group = []
-        self.__open_bracket = 0
+        self.__open_bracket = False
-        self.__found_brackets = 0
+        self.__found_brackets = False
-        self.__accept = [
+        
        'cw<ci<bold______',
        'cw<ci<annotation'  ,
        'cw<ci<blue______' ,
        'cw<ci<bold______' ,
        'cw<ci<caps______' ,
        'cw<ci<char-style' ,
        'cw<ci<dbl-strike' ,
        'cw<ci<emboss____'  ,
        'cw<ci<engrave___' ,
        'cw<ci<font-color' ,
        'cw<ci<font-down_' ,
        'cw<ci<font-size_' ,
        'cw<ci<font-style' ,
        'cw<ci<font-up___',
        'cw<ci<footnot-mk',
        'cw<ci<green_____' ,
        'cw<ci<hidden____',
        'cw<ci<italics___' ,
        'cw<ci<outline___',
        'cw<ci<red_______' ,
        'cw<ci<shadow____',
        'cw<ci<small-caps' ,
        'cw<ci<strike-thr',
        'cw<ci<subscript_' ,
        'cw<ci<superscrip',
        'cw<ci<underlined' ,
        # 'cw<ul<underlined' ,
        ]
    def __before_body_func(self, line):
        """
        If we are before the body, we are not interested in changing anything
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'in_body'
@ -95,6 +101,14 @@ class AddBrackets:
    def __in_body_func(self, line):
        """
        Select what action to take in body:
            1-At the end of the file close the bracket if a bracket was opened
            This happens if there is a change
            2-If an open bracket is found the code inside is ignored
            (written without modifications)
            3-If an accepted control word is found put the line
            in a buffer then change state to after cw
            4-Else simply write the line
        """
        if line == 'cb<nu<clos-brack<0001\n' and self.__open_bracket:
            self.__write_obj.write(
@ -102,7 +116,7 @@ class AddBrackets:
                    )
            self.__write_obj.write(line)
        elif self.__token_info == 'ob<nu<open-brack':
-            self.__found_brackets = 1
+            self.__found_brackets = True
            self.__state = 'in_ignore'
            self.__ignore_count = self.__ob_count
            self.__write_obj.write(line)
@ -114,6 +128,10 @@ class AddBrackets:
    def __after_control_word_func(self, line):
        """
        After a cw either add next allowed cw to temporary list or
        change group and write it.
        If the token leading to an exit is an open bracket go to
        ignore otherwise goto in body
        """
        if self.__token_info in self.__accept:
            self.__temp_group.append(line)
@ -129,75 +147,77 @@ class AddBrackets:
    def __write_group(self):
        """
        Write a temporary group after accepted control words end
        But this is mostly useless in my opinion as there is no list of rejected cw
        This may be a way to implement future old rtf processing for cw
        Utility: open a group to just put brackets but why be so complicated?
        Scheme: open brackets, write cw then go to body and back with cw after 
        """
        if self.__open_bracket:
            self.__write_obj.write(
                'cb<nu<clos-brack<0003\n'
                )
-            self.__open_bracket = 0
+            self.__open_bracket = False
-        inline_string = ''
+
-        the_keys = self.__inline.keys()
+        inline_string = ''.join(['%s<nu<%s\n' % (k, v) \
-        for the_key in the_keys:
+                for k, v in self.__inline.iteritems() \
-            value = self.__inline[the_key]
+                    if v != 'false'])
            if value != 'false':
                inline_string += '%s<nu<%s\n' % (the_key, value)
        if inline_string:
-            self.__write_obj.write('ob<nu<open-brack<0003\n')
+            self.__write_obj.write('ob<nu<open-brack<0003\n'
-            self.__write_obj.write(inline_string)
+                '%s' % inline_string)
-            self.__open_bracket = 1
+            self.__open_bracket = True
        self.__temp_group = []
    def __change_permanent_group(self):
        """
-        use temp group to change permanent group
+        Use temp group to change permanent group
        If the control word is not accepted remove it
        What is the interest as it is build to accept only accepted cw
        in __after_control_word_func?
        """
-        for line in self.__temp_group:
+        self.__inline = {line[:16] : line[20:-1]\
-            token_info = line[:16]
+            for line in self.__temp_group\
-            if token_info in self.__accept:
+            # Is this really necessary?
-                att = line[20:-1]
+                if line[:16] in self.__accept}
-                self.__inline[token_info] = att
+
    def __ignore_func(self, line):
        """
-        Don't add any brackets while inside of brackets RTF has already
+        Just copy data inside of RTF brackets already here.
        added.
        """
        self.__write_obj.write(line)
-        if self.__token_info == 'cb<nu<clos-brack'and\
+        if self.__token_info == 'cb<nu<clos-brack'\
-            self.__cb_count == self.__ignore_count:
+            and self.__cb_count == self.__ignore_count:
            self.__state = 'in_body'
    def __check_brackets(self, in_file):
-        self.__check_brack_obj = check_brackets.CheckBrackets\
+        """
        Return True if brackets match
        """
        check_brack_obj = check_brackets.CheckBrackets\
            (file = in_file)
-        good_br =  self.__check_brack_obj.check_brackets()[0]
+        return check_brack_obj.check_brackets()[0]
        if not good_br:
            return 1
    def add_brackets(self):
        """
        """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
+        with open(self.__file, 'r') as read_obj:
-        self.__write_obj = open(self.__write_to, 'w')
+            with open(self.__write_to, 'w') as self.__write_obj:
-        line_to_read = 1
+                for line in read_obj:
-        while line_to_read:
+                    self.__token_info = line[:16]
-            line_to_read = read_obj.readline()
+                    if self.__token_info == 'ob<nu<open-brack':
-            line = line_to_read
+                        self.__ob_count = line[-5:-1]
-            self.__token_info = line[:16]
+                    if self.__token_info == 'cb<nu<clos-brack':
-            if self.__token_info == 'ob<nu<open-brack':
+                        self.__cb_count = line[-5:-1]
-                self.__ob_count = line[-5:-1]
+                    action = self.__state_dict.get(self.__state)
-            if self.__token_info == 'cb<nu<clos-brack':
+                    if action is None:
-                self.__cb_count = line[-5:-1]
+                        sys.stderr.write(
-            action = self.__state_dict.get(self.__state)
+                            'No matching state in module add_brackets.py\n'
-            if action == None:
+                            '%s\n' % self.__state)
-                sys.stderr.write('No matching state in module add_brackets.py\n')
+                    action(line)
-                sys.stderr.write(self.__state + '\n')
+        #Check bad brackets
-            action(line)
+        if self.__check_brackets(self.__write_to):
        read_obj.close()
        self.__write_obj.close()
        bad_brackets = self.__check_brackets(self.__write_to)
        if not bad_brackets:
            copy_obj = copy.Copy(bug_handler = self.__bug_handler)
            if self.__copy:
                copy_obj.copy_file(self.__write_to, "add_brackets.data")
--- a/src/calibre/ebooks/rtf2xml/char_set.py
+++ b/src/calibre/ebooks/rtf2xml/char_set.py
--- a/src/calibre/ebooks/rtf2xml/convert_to_tags.py
+++ b/src/calibre/ebooks/rtf2xml/convert_to_tags.py
@ -1,4 +1,5 @@
 import os, sys
 from codecs import EncodedFile
 from calibre.ebooks.rtf2xml import copy, check_encoding
 from calibre.ptempfile import better_mktemp
@ -41,6 +42,7 @@ class ConvertToTags:
        self.__run_level = run_level
        self.__write_to = better_mktemp()
        self.__convert_utf = False
        self.__bad_encoding = False
    def __initiate_values(self):
        """
@ -213,13 +215,14 @@ class ConvertToTags:
        if not check_encoding_obj.check_encoding(self.__file, verbose=False):
            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
-        elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
+        elif not check_encoding_obj.check_encoding(self.__file, self.__encoding, verbose=False):
            self.__write_obj.write('<?xml version="1.0" encoding="UTF-8" ?>')
            self.__convert_utf = True
        else:
            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
            sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
                    ' hope for the best')
            self.__bad_encoding = True
        self.__new_line = 0
        self.__write_new_line()
        if self.__no_dtd:
@ -247,7 +250,7 @@ class ConvertToTags:
        the appropriate function.
        The functions that are called:
            a text function for text
-            an open funciton for open tags
+            an open function for open tags
            an open with attribute function for tags with attributes
            an empty with attribute function for tags that are empty but have
            attributes.
@ -263,20 +266,19 @@ class ConvertToTags:
                    action = self.__state_dict.get(self.__token_info)
                    if action is not None:
                        action(line)
-        self.__write_obj.close()
+        #convert all encodings to UTF8 or ASCII to avoid unsupported encodings in lxml
-        #convert all encodings to UTF8 to avoid unsupported encodings in lxml
+        if self.__convert_utf or self.__bad_encoding:
        if self.__convert_utf:
            copy_obj = copy.Copy(bug_handler = self.__bug_handler)
            copy_obj.rename(self.__write_to, self.__file)
            file_encoding = "utf-8"
            if self.__bad_encoding:
                file_encoding = "us-ascii"
            with open(self.__file, 'r') as read_obj:
                with open(self.__write_to, 'w') as write_obj:
-                    file = read_obj.read()
+                    write_objenc = EncodedFile(write_obj, self.__encoding,
-                    try:
+                                    file_encoding, 'replace')
-                        file = file.decode(self.__encoding)
+                    for line in read_obj:
-                        write_obj.write(file.encode('utf-8'))
+                        write_objenc.write(line)
                    except:
                        sys.stderr.write('Conversion to UTF-8 is not possible,'
                        ' encoding should be very carefully checked')
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "convert_to_tags.data")
--- a/src/calibre/ebooks/rtf2xml/header.py
+++ b/src/calibre/ebooks/rtf2xml/header.py
@ -11,6 +11,7 @@
 #                                                                       #
 #########################################################################
 import sys, os
 from calibre.ebooks.rtf2xml import copy
 from calibre.ptempfile import better_mktemp
@ -31,29 +32,29 @@ class Header:
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
-        self.__found_a_header = 0
+        self.__found_a_header = False
    def __in_header_func(self, line):
        """
        Handle all tokens that are part of header
        """
        if self.__cb_count == self.__header_bracket_count:
-            self.__in_header = 0
+            self.__in_header = False
            self.__write_obj.write(line)
            self.__write_to_head_obj.write(
-            'mi<mk<head___clo\n')
+            'mi<mk<head___clo\n' \
-            self.__write_to_head_obj.write(
+            'mi<tg<close_____<header-or-footer\n' \
            'mi<tg<close_____<header-or-footer\n')
            self.__write_to_head_obj.write(
            'mi<mk<header-clo\n')
        else:
            self.__write_to_head_obj.write(line)
    def __found_header(self, line):
        """
        Found a header
        """
        # but this could be header or footer
-        self.__found_a_header = 1
+        self.__found_a_header = True
-        self.__in_header = 1
+        self.__in_header = True
        self.__header_count += 1
        # temporarily set this to zero so I can enter loop
        self.__cb_count = 0
@ -69,18 +70,23 @@ class Header:
                    'mi<tg<open-att__<header-or-footer<type>%s\n' % (type)
                    )
        else:
-            sys.stderr.write('module is header\n')
+            sys.stderr.write(
-            sys.stderr.write('method is __found_header\n')
+            'module is header\n' \
-            sys.stderr.write('no dict entry\n')
+            'method is __found_header\n' \
-            sys.stderr.write('line is %s' % line)
+            'no dict entry\n' \
            'line is %s' % line)
            self.__write_to_head_obj.write(
                    'mi<tg<open-att__<header-or-footer<type>none\n'
                    )
    def __default_sep(self, line):
-        """Handle all tokens that are not header tokens"""
+        """
        Handle all tokens that are not header tokens
        """
        if self.__token_info[3:5] == 'hf':
            self.__found_header(line)
        self.__write_obj.write(line)
    def __initiate_sep_values(self):
        """
        initiate counters for separate_footnotes method.
@ -89,7 +95,7 @@ class Header:
        self.__ob_count = 0
        self.__cb_count = 0
        self.__header_bracket_count = 0
-        self.__in_header = 0
+        self.__in_header = False
        self.__header_count = 0
        self.__head_dict = {
            'head-left_'        :   ('header-left'),
@ -101,6 +107,7 @@ class Header:
            'header____'        :   ('header' ),
            'footer____'        :   ('footer' ),
        }
    def separate_headers(self):
        """
        Separate all the footnotes in an RTF file and put them at the bottom,
@ -110,53 +117,47 @@ class Header:
        bottom of the main file.
        """
        self.__initiate_sep_values()
        read_obj = open(self.__file)
        self.__write_obj = open(self.__write_to, 'w')
        self.__header_holder = better_mktemp()
-        self.__write_to_head_obj = open(self.__header_holder, 'w')
+        with open(self.__file) as read_obj:
-        line_to_read = 1
+            with open(self.__write_to, 'w') as self.__write_obj:
-        while line_to_read:
+                with open(self.__header_holder, 'w') as self.__write_to_head_obj:
-            line_to_read = read_obj.readline()
+                    for line in read_obj:
-            line = line_to_read
+                        self.__token_info = line[:16]
-            self.__token_info = line[:16]
+                        # keep track of opening and closing brackets
-            # keep track of opening and closing brackets
+                        if self.__token_info == 'ob<nu<open-brack':
-            if self.__token_info == 'ob<nu<open-brack':
+                            self.__ob_count = line[-5:-1]
-                self.__ob_count = line[-5:-1]
+                        if self.__token_info == 'cb<nu<clos-brack':
-            if self.__token_info == 'cb<nu<clos-brack':
+                            self.__cb_count = line[-5:-1]
-                self.__cb_count = line[-5:-1]
+                        # In the middle of footnote text
-            # In the middle of footnote text
+                        if self.__in_header:
-            if self.__in_header:
+                            self.__in_header_func(line)
-                self.__in_header_func(line)
+                        # not in the middle of footnote text
-            # not in the middle of footnote text
+                        else:
-            else:
+                            self.__default_sep(line)
-                self.__default_sep(line)
+        
-        self.__write_obj.close()
+        with open(self.__header_holder, 'r') as read_obj:
-        read_obj.close()
+            with open(self.__write_to, 'a') as write_obj:
-        self.__write_to_head_obj.close()
+                write_obj.write(
-        read_obj = open(self.__header_holder, 'r')
+                'mi<mk<header-beg\n')
-        write_obj = open(self.__write_to, 'a')
+                for line in read_obj:
-        write_obj.write(
+                    write_obj.write(line)
-        'mi<mk<header-beg\n')
+                write_obj.write(
-        line = 1
+                'mi<mk<header-end\n')
        while line:
            line = read_obj.readline()
            write_obj.write(line)
        write_obj.write(
        'mi<mk<header-end\n')
        read_obj.close()
        write_obj.close()
        os.remove(self.__header_holder)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
-            copy_obj.copy_file(self.__write_to, "header_separate.info")
+            copy_obj.copy_file(self.__write_to, "header_separate.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
    def update_info(self, file, copy):
        """
        Replace the stored file and copy attributes of this instance.

        file -- new value stored as the instance's file attribute
        copy -- new value stored as the instance's copy attribute

        Returns nothing.
        Marked as an unused method by the original author.
        """
        self.__file = file
        self.__copy = copy
    def __get_head_body_func(self, line):
        """
        Process lines in main body and look for beginning of headers.
@ -166,6 +167,7 @@ class Header:
            self.__state = 'head'
        else:
            self.__write_obj.write(line)
    def __get_head_head_func(self, line):
        """
        Copy headers and footers from bottom of file to a separate, temporary file.
@ -174,6 +176,7 @@ class Header:
            self.__state = 'body'
        else:
            self.__write_to_head_obj.write(line)
    def __get_headers(self):
        """
        Private method to remove footnotes from main file.  Read one line from
@ -182,21 +185,16 @@ class Header:
        These two functions do the work of separating the footnotes from the
        body.
        """
-        read_obj = open(self.__file)
+        with open(self.__file) as read_obj:
-        self.__write_obj = open(self.__write_to, 'w')
+            with open(self.__write_to, 'w') as self.__write_obj:
-            # self.__write_to = "footnote_info.data"
+                with open(self.__header_holder, 'w') as self.__write_to_head_obj:
-        self.__write_to_head_obj = open(self.__header_holder, 'w')
+                    for line in read_obj:
-        line = 1
+                        self.__token_info = line[:16]
-        while line:
+                        if self.__state == 'body':
-            line = read_obj.readline()
+                            self.__get_head_body_func(line)
-            self.__token_info = line[:16]
+                        elif self.__state == 'head':
-            if self.__state == 'body':
+                            self.__get_head_head_func(line)
-                self.__get_head_body_func(line)
+
            elif self.__state == 'head':
                self.__get_head_head_func(line)
        read_obj.close()
        self.__write_obj.close()
        self.__write_to_head_obj.close()
    def __get_head_from_temp(self, num):
        """
        Private method for joining headers and footers to body. This method
@ -205,18 +203,17 @@ class Header:
        returns them as a string.
        """
        look_for = 'mi<mk<header-ope<' + num + '\n'
-        found_head = 0
+        found_head = False
        string_to_return = ''
-        line = 1
+        for line in self.__read_from_head_obj:
        while line:
            line = self.__read_from_head_obj.readline()
            if found_head:
                if line == 'mi<mk<header-clo\n':
                    return string_to_return
-                string_to_return = string_to_return + line
+                string_to_return += line
            else:
                if line == look_for:
-                    found_head = 1
+                    found_head = True
    def __join_from_temp(self):
        """
        Private method for rejoining footnotes to body.  Read from the
@ -227,15 +224,13 @@ class Header:
        If no footnote marker is found, simply print out the token (line).
        """
        self.__read_from_head_obj = open(self.__header_holder, 'r')
        read_obj = open(self.__write_to, 'r')
        self.__write_obj = open(self.__write_to2, 'w')
-        line = 1
+        with open(self.__write_to, 'r') as read_obj:
-        while line:
+            for line in read_obj:
-            line = read_obj.readline()
+                if line[:16] == 'mi<mk<header-ind':
-            if line[:16] == 'mi<mk<header-ind':
+                    line = self.__get_head_from_temp(line[17:-1])
-                line = self.__get_head_from_temp(line[17:-1])
+                self.__write_obj.write(line)
-            self.__write_obj.write(line)
+
        read_obj.close()
    def join_headers(self):
        """
        Join the footnotes from the bottom of the file and put them in their
--- a/src/calibre/ebooks/rtf2xml/hex_2_utf8.py
+++ b/src/calibre/ebooks/rtf2xml/hex_2_utf8.py
@ -181,7 +181,7 @@ class Hex2Utf8:
            self.__dingbats_dict.update(dingbats_base_dict)
            self.__dingbats_dict.update(ms_dingbats_dict)
        # load dictionary for caps, and make a string for the replacement
-        self.__caps_uni_dict = char_map_obj.get_char_map(map='caps_uni')
+        self.__caps_uni_dict = char_map_obj.get_char_map(map = 'caps_uni')
        # # print self.__caps_uni_dict
        # don't think I'll need this
        ##keys = self.__caps_uni_dict.keys()
--- a/src/calibre/ebooks/rtf2xml/old_rtf.py
+++ b/src/calibre/ebooks/rtf2xml/old_rtf.py
@ -11,14 +11,18 @@
 #                                                                       #
 #########################################################################
 import sys
-"""
+
 """
 class OldRtf:
    """
    Check to see if the RTF is an older version
    Logic:
    If allowable control word/properties happen in text without being enclosed
    in brackets the file will be considered old rtf
    """
-    def __init__(self, in_file, bug_handler, run_level ):
+    def __init__(self, in_file,
                bug_handler,
                run_level,
                ):
        """
        Required:
            'file'--file to parse
@ -32,46 +36,46 @@ class OldRtf:
            """
        self.__file = in_file
        self.__bug_handler = bug_handler
-        self.__initiate_values()
+        self.__run_level = run_level
        self.__ob_group = 0
    def __initiate_values(self):
        self.__previous_token = ''
        self.__new_found = 0
        self.__allowable = [
-        'annotation' ,
+            'annotation' ,
-        'blue______'  ,
+            'blue______'  ,
-        'bold______',
+            'bold______',
-        'caps______',
+            'caps______',
-        'char-style' ,
+            'char-style' ,
-        'dbl-strike' ,
+            'dbl-strike' ,
-        'emboss____',
+            'emboss____',
-        'engrave___' ,
+            'engrave___' ,
-        'font-color',
+            'font-color',
-        'font-down_' ,
+            'font-down_' ,
-        'font-size_',
+            'font-size_',
-        'font-style',
+            'font-style',
-        'font-up___',
+            'font-up___',
-        'footnot-mk' ,
+            'footnot-mk' ,
-        'green_____' ,
+            'green_____' ,
-        'hidden____',
+            'hidden____',
-        'italics___',
+            'italics___',
-        'outline___',
+            'outline___',
-        'red_______',
+            'red_______',
-        'shadow____' ,
+            'shadow____' ,
-        'small-caps',
+            'small-caps',
-        'strike-thr',
+            'strike-thr',
-        'subscript_',
+            'subscript_',
-        'superscrip' ,
+            'superscrip' ,
-        'underlined' ,
+            'underlined' ,
        ]
        self.__state = 'before_body'
        self.__action_dict = {
            'before_body'   : self.__before_body_func,
            'in_body'       : self.__check_tokens_func,
            'after_pard'    : self.__after_pard_func,
        }
-        self.__is_old = 0
+
    def __initiate_values(self):
        """
        Reset the scanning state: clear the previous token, return to
        the 'before_body' state and zero the found-new and open-bracket
        group counters.
        """
        self.__previous_token = ''
        self.__state = 'before_body'
        self.__found_new = 0
        self.__ob_group = 0
    def __check_tokens_func(self, line):
        if self.__inline_info in self.__allowable:
            if self.__ob_group == self.__base_ob_count:
@ -80,48 +84,56 @@ class OldRtf:
                self.__found_new += 1
        elif self.__token_info ==  'cw<pf<par-def___':
            self.__state = 'after_pard'
    def __before_body_func(self, line):
        """
        Handler for the 'before_body' state.

        When the current token is the body-open marker, switch to the
        'in_body' state and remember the open-bracket group count at
        that point as the base count. Any other token is ignored.
        The line argument itself is not read here.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'in_body'
            # bracket depth at body start, compared later in __check_tokens_func
            self.__base_ob_count = self.__ob_group
    def __after_pard_func(self, line):
        """
        Handler for the 'after_pard' state.

        Stay in this state while lines start with 'cw'; the first line
        that does not start with 'cw' switches the state back to 'in_body'.
        """
        if line[0:2] != 'cw':
            self.__state = 'in_body'
    def check_if_old_rtf(self):
        """
        Requires:
            nothing
        Returns:
-            1 if file is older RTf
+            True if file is older RTf
-            0 if file is newer RTF
+            False if file is newer RTF
        """
-
+        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        line = 1
        line_num = 0
-        while line:
+        with open(self.__file, 'r') as read_obj:
-            line = read_obj.readline()
+            for line in read_obj:
-            line_num += 1
+                line_num += 1
-            self.__token_info = line[:16]
+                self.__token_info = line[:16]
-            if self.__token_info == 'mi<mk<body-close':
+                if self.__token_info == 'mi<mk<body-close':
-                return 0
+                    return False
-                self.__ob_group = 0
+                if self.__token_info == 'ob<nu<open-brack':
-            if self.__token_info == 'ob<nu<open-brack':
+                    self.__ob_group += 1
-                self.__ob_group += 1
+                    self.__ob_count = line[-5:-1]
-                self.__ob_count = line[-5:-1]
+                if self.__token_info == 'cb<nu<clos-brack':
-            if self.__token_info == 'cb<nu<clos-brack':
+                    self.__ob_group -= 1
-                self.__ob_group -= 1
+                    self.__cb_count = line[-5:-1]
-                self.__cb_count = line[-5:-1]
+                self.__inline_info = line[6:16]
-            self.__inline_info = line[6:16]
+                if self.__state == 'after_body':
-            if self.__state == 'after_body':
+                    return False
-                return 0
+                action = self.__action_dict.get(self.__state)
-            action = self.__action_dict.get(self.__state)
+                if action is None:
-            if not action:
+                    try:
-                sys.stderr.write('No action for state!\n')
+                        sys.stderr.write('No action for this state!\n')
-            result = action(line)
+                    except:
-            if result == 'new_rtf':
+                        pass
-                return 0
+                result = action(line)
-            elif result == 'old_rtf':
+                if result == 'new_rtf':
-                return 1
+                    return False
-            self.__previous_token = line[6:16]
+                elif result == 'old_rtf':
-        return 0
+                    if self.__run_level > 3:
                        sys.stderr.write(
                            'Old rtf construction %s (bracket %s, line %s)\n' 
                                % (self.__inline_info, str(self.__ob_group), line_num)
                        )
                    return True
                self.__previous_token = line[6:16]
        return False
--- a/src/calibre/ebooks/rtf2xml/output.py
+++ b/src/calibre/ebooks/rtf2xml/output.py
@ -10,7 +10,9 @@
 #                                                                       #
 #                                                                       #
 #########################################################################
-import sys, os, codecs
+import sys, os
 # , codecs
 class Output:
    """
    Output file
@ -19,7 +21,8 @@ class Output:
            file,
            orig_file,
            output_dir = None,
-            out_file = None
+            out_file = None,
            no_ask = True
            ):
        """
        Required:
@ -33,8 +36,9 @@ class Output:
        self.__file = file
        self.__orig_file = orig_file
        self.__output_dir = output_dir
-        self.__no_ask = 1
+        self.__no_ask = no_ask
        self.__out_file = out_file
    def output(self):
        """
        Required:
@ -45,13 +49,14 @@ class Output:
            output the line to the screen if no output file given. Otherwise, output to
            the file.
        """
        # self.__output_xml(self.__file, self.__out_file)
        if self.__output_dir:
            self.__output_to_dir_func()
        elif self.__out_file:
-            self.__output_xml(self.__file, self.__out_file)
+            self.__output_to_file_func()
            # self.__output_xml(self.__file, self.__out_file)
        else:
            self.__output_to_standard_func()
    def __output_to_dir_func(self):
        """
        Requires:
@ -64,32 +69,25 @@ class Output:
        """
        base_name = os.path.basename(self.__orig_file)
        base_name, ext  = os.path.splitext(base_name)
-        output_file = '%s.xml' % base_name
+        output_file = os.path.join(self.__output_dir, '%s.xml' % base_name)
        output_file = os.path.join(self.__output_dir, output_file)
        # change if user wants to output to a specific file
        if self.__out_file:
            output_file = os.path.join(self.__output_dir, self.__out_file)
        user_response = 'o'
-        if os.path.isfile(output_file):
+        if os.path.isfile(output_file) and not self.__no_ask:
-            if self.__no_ask:
+            msg = 'Do you want to overwrite %s?\n' % output_file
-                user_response = 'o'
+            msg += ('Type "o" to overwrite.\n'
-            else:
+                    'Type any other key to print to standard output.\n')
-                msg = 'Do you want to over-write %s?\n' % output_file
+            sys.stderr.write(msg)
-                msg += 'Type "o" to over-write.\n'
+            user_response = raw_input()
                msg += 'Type any other key to print to standard output.\n'
                sys.stderr.write(msg)
                user_response = raw_input()
        if user_response == 'o':
-            read_obj = open(self.__file, 'r')
+            with open(self.__file, 'r') as read_obj:
-            write_obj = open(output_file, 'w')
+                with open(self.output_file, 'w') as write_obj:
-            line = 1
+                    for line in read_obj:
-            while line:
+                        write_obj.write(line)
                line = read_obj.readline()
                write_obj.write(line)
            read_obj.close()
            write_obj.close()
        else:
            self.__output_to_standard_func()
    def __output_to_file_func(self):
        """
        Required:
@ -99,14 +97,11 @@ class Output:
        Logic:
            read one line at a time. Output to standard
        """
-        read_obj = open(self.__file, 'r')
+        with open(self.__file, 'r') as read_obj:
-        write_obj = open(self.__out_file, 'w')
+            with open(self.__out_file, 'w') as write_obj:
-        line = 1
+                for line in read_obj:
-        while line:
+                    write_obj.write(line)
-            line = read_obj.readline()
+
            write_obj.write(line)
        read_obj.close()
        write_obj.close()
    def __output_to_standard_func(self):
        """
        Required:
@ -116,26 +111,24 @@ class Output:
        Logic:
            read one line at a time. Output to standard
        """
-        read_obj = open(self.__file, 'r')
+        with open(self.__file, 'r') as read_obj:
-        line = 1
+            for line in read_obj:
-        while line:
+                sys.stdout.write(line)
-            line = read_obj.readline()
+
-            sys.stdout.write(line)
+    # def __output_xml(self, in_file, out_file):
-        read_obj.close()
+        # """
-    def __output_xml(self, in_file, out_file):
+        # output the ill-formed xml file
-        """
+        # """
-        output the ill-formed xml file
+        # (utf8_encode, utf8_decode, utf8_reader, utf8_writer) = codecs.lookup("utf-8")
-        """
+        # write_obj = utf8_writer(open(out_file, 'w'))
-        (utf8_encode, utf8_decode, utf8_reader, utf8_writer) = codecs.lookup("utf-8")
+        # write_obj = open(out_file, 'w')
-        write_obj = utf8_writer(open(out_file, 'w'))
+        # read_obj = utf8_writer(open(in_file, 'r'))
-        write_obj = open(out_file, 'w')
+        # read_obj = open(in_file, 'r')
-        read_obj = utf8_writer(open(in_file, 'r'))
+        # line = 1
-        read_obj = open(in_file, 'r')
+        # while line:
-        line = 1
+            # line = read_obj.readline()
-        while line:
+            # if isinstance(line, type(u"")):
-            line = read_obj.readline()
+                # line = line.encode("utf-8")
-            if isinstance(line, type(u"")):
+            # write_obj.write(line)
-                line = line.encode("utf-8")
+        # read_obj.close()
-            write_obj.write(line)
+        # write_obj.close()
        read_obj.close()
        write_obj.close()
--- a/src/calibre/ebooks/rtf2xml/paragraphs.py
+++ b/src/calibre/ebooks/rtf2xml/paragraphs.py
@ -11,31 +11,32 @@
 #                                                                       #
 #########################################################################
 import sys, os
 from calibre.ebooks.rtf2xml import copy
 from calibre.ptempfile import better_mktemp
 class Paragraphs:
    """
-=================
+    =================
-Purpose
+    Purpose
-=================
+    =================
-Write paragraph tags for a tokenized file. (This module won't be any use to use
+    Write paragraph tags for a tokenized file. (This module won't be any use
-to you unless you use it as part of the other modules.)
+    to you unless you use it as part of the other modules.)
-------------
+    -------------
-Method
+    Method
-------------
+    -------------
-RTF does not tell you when a paragraph begins. It only tells you when the
+    RTF does not tell you when a paragraph begins. It only tells you when the
-paragraph ends.
+    paragraph ends.
-In order to make paragraphs out of this limited info, the parser starts in the
+    In order to make paragraphs out of this limited info, the parser starts in the
-body of the documents and assumes it is not in a paragraph. It looks for clues
+    body of the documents and assumes it is not in a paragraph. It looks for clues
-to begin a paragraph. Text starts a paragraph; so does an inline field or
+    to begin a paragraph. Text starts a paragraph; so does an inline field or
-list-text. If an end of paragraph marker (\par) is found, then this indicates
+    list-text. If an end of paragraph marker (\par) is found, then this indicates
-a blank paragraph.
+    a blank paragraph.
-Once a paragraph is found, the state changes to 'paragraph.' In this state,
+    Once a paragraph is found, the state changes to 'paragraph.' In this state,
-clues are looked to for the end of a paragraph. The end of a paragraph marker
+    clues are looked to for the end of a paragraph. The end of a paragraph marker
-(\par) marks the end of a paragraph. So does the end of a footnote or heading;
+    (\par) marks the end of a paragraph. So does the end of a footnote or heading;
-a paragraph definintion; the end of a field-block; and the beginning of a
+    a paragraph definition; the end of a field-block; and the beginning of a
-section. (How about the end of a section or the end of a field-block?)
+    section. (How about the end of a section or the end of a field-block?)
    """
    def __init__(self,
            in_file,
@ -60,6 +61,7 @@ section. (How about the end of a section or the end of a field-block?)
        self.__write_empty_para = write_empty_para
        self.__run_level = run_level
        self.__write_to = better_mktemp()
    def __initiate_values(self):
        """
        Initiate all values.
@ -77,7 +79,7 @@ section. (How about the end of a section or the end of a field-block?)
        self.__paragraph_dict = {
        'cw<pf<par-end___'      : self.__close_para_func,   # end of paragraph
        'mi<mk<headi_-end'      : self.__close_para_func,   # end of header or footer
-        ##'cw<pf<par-def___'      : self.__close_para_func,   # paragraph definition
+        ## 'cw<pf<par-def___'      : self.__close_para_func,   # paragraph definition
        # 'mi<mk<fld-bk-end'      : self.__close_para_func,   # end of field-block
        'mi<mk<fldbk-end_'      : self.__close_para_func,   # end of field-block
        'mi<mk<body-close'      : self.__close_para_func,   # end of body
@ -99,6 +101,7 @@ section. (How about the end of a section or the end of a field-block?)
        'mi<mk<pict-start'      : self.__start_para_func,
        'cw<pf<page-break'      : self.__empty_pgbk_func,    # page break
        }
    def __before_body_func(self, line):
        """
        Required:
@ -112,6 +115,7 @@ section. (How about the end of a section or the end of a field-block?)
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'not_paragraph'
        self.__write_obj.write(line)
    def __not_paragraph_func(self, line):
        """
        Required:
@ -127,6 +131,7 @@ section. (How about the end of a section or the end of a field-block?)
        if action:
            action(line)
        self.__write_obj.write(line)
    def __paragraph_func(self, line):
        """
        Required:
@ -144,6 +149,7 @@ section. (How about the end of a section or the end of a field-block?)
            action(line)
        else:
            self.__write_obj.write(line)
    def __start_para_func(self, line):
        """
        Requires:
@ -160,6 +166,7 @@ section. (How about the end of a section or the end of a field-block?)
        )
        self.__write_obj.write(self.__start2_marker)
        self.__state = 'paragraph'
    def __empty_para_func(self, line):
        """
        Requires:
@ -176,6 +183,7 @@ section. (How about the end of a section or the end of a field-block?)
            'mi<tg<empty_____<para\n'
            )
            self.__write_obj.write(self.__end_marker)   # marker for later parsing
    def __empty_pgbk_func(self, line):
        """
        Requires:
@ -188,6 +196,7 @@ section. (How about the end of a section or the end of a field-block?)
        self.__write_obj.write(
        'mi<tg<empty_____<page-break\n'
        )
    def __close_para_func(self, line):
        """
        Requires:
@ -205,6 +214,7 @@ section. (How about the end of a section or the end of a field-block?)
        self.__write_obj.write(self.__end_marker) # marker for later parser
        self.__write_obj.write(line)
        self.__state = 'not_paragraph'
    def __bogus_para__def_func(self, line):
        """
        Requires:
@ -215,6 +225,7 @@ section. (How about the end of a section or the end of a field-block?)
            if a \pard occurs in a paragraph, I want to ignore it. (I believe)
        """
        self.__write_obj.write('mi<mk<bogus-pard\n')
    def make_paragraphs(self):
        """
        Requires:
@ -229,20 +240,18 @@ section. (How about the end of a section or the end of a field-block?)
            only other state is 'paragraph'.
        """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
+        with open(self.__file, 'r') as read_obj:
-        self.__write_obj = open(self.__write_to, 'w')
+            with open(self.__write_to, 'w') as self.__write_obj:
-        line_to_read = 1
+                for line in read_obj:
-        while line_to_read:
+                    self.__token_info = line[:16]
-            line_to_read = read_obj.readline()
+                    action = self.__state_dict.get(self.__state)
-            line = line_to_read
+                    if action is None:
-            self.__token_info = line[:16]
+                        try:
-            action = self.__state_dict.get(self.__state)
+                            sys.stderr.write('no matching state in module paragraphs.py\n')
-            if action == None:
+                            sys.stderr.write(self.__state + '\n')
-                sys.stderr.write('no no matching state in module sections.py\n')
+                        except:
-                sys.stderr.write(self.__state + '\n')
+                            pass
-            action(line)
+                    action(line)
        read_obj.close()
        self.__write_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "paragraphs.data")
--- a/src/calibre/ebooks/rtf2xml/preamble_rest.py
+++ b/src/calibre/ebooks/rtf2xml/preamble_rest.py
@ -11,16 +11,24 @@
 #                                                                       #
 #########################################################################
 import sys,os
 from calibre.ebooks.rtf2xml import copy
 class Preamble:
    """
    Fix the reamaing parts of the preamble. This module does very little. It
    makes sure that no text gets put in the revision of list table. In the
-    future, when I understand how to interprett he revision table and list
+    future, when I understand how to interpret the revision table and list
    table, I will make these methods more functional.
    """
-    def __init__(self, file, bug_handler,  platform, default_font, code_page,
+    def __init__(self, file,
-    copy=None, temp_dir=None):
+                bug_handler,
                platform,
                default_font,
                code_page,
                copy=None,
                temp_dir=None,
                ):
        """
        Required:
            file--file to parse
@ -44,6 +52,7 @@ class Preamble:
            self.__write_to = os.path.join(temp_dir,"info_table_info.data")
        else:
            self.__write_to = "info_table_info.data"
    def __initiate_values(self):
        """
        Initiate all values.
@ -62,12 +71,14 @@ class Preamble:
        'mi<mk<revtbl-beg'      : self.__found_revision_table_func,
        'mi<mk<body-open_'      : self.__found_body_func,
        }
    def __default_func(self, line):
        action = self.__default_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)
    def __found_rtf_head_func(self, line):
        """
        Requires:
@ -84,8 +95,10 @@ class Preamble:
            '<platform>%s\n' % (self.__default_font, self.__code_page,
            self.__platform)
        )
    def __found_list_table_func(self, line):
        self.__state = 'list_table'
    def __list_table_func(self, line):
        if self.__token_info == 'mi<mk<listabend_':
            self.__state = 'default'
@ -93,8 +106,10 @@ class Preamble:
            pass
        else:
            self.__write_obj.write(line)
    def __found_revision_table_func(self, line):
        self.__state = 'revision'
    def __revision_table_func(self, line):
        if self.__token_info == 'mi<mk<revtbl-end':
            self.__state = 'default'
@ -102,11 +117,14 @@ class Preamble:
            pass
        else:
            self.__write_obj.write(line)
    def __found_body_func(self, line):
        self.__state = 'body'
        self.__write_obj.write(line)
    def __body_func(self, line):
        self.__write_obj.write(line)
    def fix_preamble(self):
        """
        Requires:
@ -119,20 +137,15 @@ class Preamble:
            the list table.
        """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
+        with open(self.__file, 'r') as read_obj:
-        self.__write_obj = open(self.__write_to, 'w')
+            with open(self.__write_to, 'w') as self.__write_obj:
-        line_to_read = 1
+                for line in read_obj:
-        while line_to_read:
+                    self.__token_info = line[:16]
-            line_to_read = read_obj.readline()
+                    action = self.__state_dict.get(self.__state)
-            line = line_to_read
+                    if action is None:
-            self.__token_info = line[:16]
+                        sys.stderr.write(
-            action = self.__state_dict.get(self.__state)
+                        'no matching state in module preamble_rest.py\n' + self.__state + '\n')
-            if action == None:
+                    action(line)
                sys.stderr.write('no no matching state in module preamble_rest.py\n')
                sys.stderr.write(self.__state + '\n')
            action(line)
        read_obj.close()
        self.__write_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "preamble_div.data")
--- a/src/calibre/ebooks/rtf2xml/sections.py
+++ b/src/calibre/ebooks/rtf2xml/sections.py
@ -11,43 +11,44 @@
 #                                                                       #
 #########################################################################
 import sys, os
 from calibre.ebooks.rtf2xml import copy
 from calibre.ptempfile import better_mktemp
 class Sections:
    """
-=================
+    =================
-Purpose
+    Purpose
-=================
+    =================
-Write section tags for a tokenized file. (This module won't be any use to use
+    Write section tags for a tokenized file. (This module won't be any use
-to you unless you use it as part of the other modules.)
+    to you unless you use it as part of the other modules.)
---------------
+    ---------------
-logic
+    logic
---------------
+    ---------------
-The tags for the first section breaks have already been written.
+    The tags for the first section breaks have already been written.
-RTF stores section breaks with the \sect tag. Each time this tag is
+    RTF stores section breaks with the \sect tag. Each time this tag is
-encountered, add one to the counter.
+    encountered, add one to the counter.
-When I encounter the \sectd tag, I want to collect all the appropriate tokens
+    When I encounter the \sectd tag, I want to collect all the appropriate tokens
-that describe the section. When I reach a \pard, I know I an stop collecting
+    that describe the section. When I reach a \pard, I know I can stop collecting
-tokens and write the section tags.
+    tokens and write the section tags.
-The exception to this method occurs when sections occur in field blocks, such
+    The exception to this method occurs when sections occur in field blocks, such
-as the index. Normally, two section break occur within the index and other
+    as the index. Normally, two section breaks occur within the index and other
-field-blocks. (If less or more section breaks occurr, this code may not work.)
+    field-blocks. (If fewer or more section breaks occur, this code may not work.)
-I want the sections to occurr outside of the index. That is, the index
+    I want the sections to occur outside of the index. That is, the index
-should be nested inside one section tag. After the index is complete, a new
+    should be nested inside one section tag. After the index is complete, a new
-section should begin.
+    section should begin.
-In order to write the sections outside of the field blocks, I have to store
+    In order to write the sections outside of the field blocks, I have to store
-all of the field block as a string. When I ecounter the \sect tag, add one to
+    all of the field block as a string. When I encounter the \sect tag, add one to
-the section counter, but store this number in a list. Likewise, store the
+    the section counter, but store this number in a list. Likewise, store the
-information describing the section in another list.
+    information describing the section in another list.
-When I reach the end of the field block, choose the first item from the
+    When I reach the end of the field block, choose the first item from the
-numbered list as the section number. Choose the first item in the description
+    numbered list as the section number. Choose the first item in the description
-list as the values and attributes of the section. Enclose the field string
+    list as the values and attributes of the section. Enclose the field string
-between the section tags.
+    between the section tags.
-Start a new section outside the field-block strings. Use the second number in
+    Start a new section outside the field-block strings. Use the second number in
-the list; use the second item in the description list.
+    the list; use the second item in the description list.
-CHANGE (2004-04-26) No longer write sections that occurr in field-blocks.
+    CHANGE (2004-04-26) No longer write sections that occur in field-blocks.
-Instead, ingore all section information in a field-block.
+    Instead, ignore all section information in a field-block.
    """
    def __init__(self,
            in_file,