diff --git a/resources/templates/rtf.xsl b/resources/templates/rtf.xsl
index ea1fc71172..6db1c0388d 100644
--- a/resources/templates/rtf.xsl
+++ b/resources/templates/rtf.xsl
@@ -287,7 +287,7 @@
                 <xsl:value-of select="count(preceding::rtf:footnote) + 1"/>
                 <xsl:text>]</xsl:text>
             </xsl:when>
-            <xsl:when test="(@superscript = 'true')">
+            <xsl:when test="(@superscript)">
                 <xsl:element name="sup">
                     <xsl:element name="span">
                         <xsl:attribute name="class">
@@ -297,7 +297,7 @@
                     </xsl:element>
                 </xsl:element>
             </xsl:when>
-            <xsl:when test="(@underscript = 'true')">
+            <xsl:when test="(@underscript or @subscript)">
                 <xsl:element name="sub">
                     <xsl:element name="span">
                         <xsl:attribute name="class">
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 714a5b656f..ba13668eb7 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -77,7 +77,15 @@ class RTFInput(InputFormatPlugin):
 
     def generate_xml(self, stream):
         from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
-        ofile = 'out.xml'
+        ofile = 'dataxml.xml'
+        run_lev, debug_dir = 1, None  # defaults: normal run, no rtf2xml debug directory
+        if getattr(self.opts, 'debug_pipeline', None) is not None:
+            try:
+                os.mkdir('rtfdebug')  # must exist before it is passed to ParseRtf as deb_dir
+                debug_dir = 'rtfdebug'
+                run_lev = 4
+            except OSError:  # best effort: directory not created, keep the non-debug defaults
+                pass
         parser = ParseRtf(
             in_file    = stream,
             out_file   = ofile,
@@ -115,43 +123,45 @@ class RTFInput(InputFormatPlugin):
 
             # Write or do not write paragraphs. Default is 0.
             empty_paragraphs = 1,
+
+            #debug
+            deb_dir = debug_dir,
+            run_level = run_lev,
         )
         parser.parse_rtf()
-        ans = open('out.xml').read()
-        os.remove('out.xml')
-        return ans
+        with open(ofile, 'rb') as f:
+            return f.read()
 
     def extract_images(self, picts):
+        import imghdr
         self.log('Extracting images...')
 
+        with open(picts, 'rb') as f:
+            raw = f.read()
+        picts = filter(len, re.findall(r'\{\\pict([^}]+)\}', raw))  # non-empty \pict group bodies
+        non_hex = re.compile(r'[^a-fA-F0-9]')  # matches every character that is not a hex digit
+        encs = [non_hex.sub('', pict) for pict in picts]
+
         count = 0
-        raw = open(picts, 'rb').read()
-        starts = []
-        for match in re.finditer(r'\{\\pict([^}]+)\}', raw):
-            starts.append(match.start(1))
-
         imap = {}
-
-        for start in starts:
-            pos, bc = start, 1
-            while bc > 0:
-                if raw[pos] == '}': bc -= 1
-                elif raw[pos] == '{': bc += 1
-                pos += 1
-            pict = raw[start:pos+1]
-            enc = re.sub(r'[^a-zA-Z0-9]', '', pict)
+        for enc in encs:
             if len(enc) % 2 == 1:
                 enc = enc[:-1]
             data = enc.decode('hex')
+            fmt = imghdr.what(None, data)
+            if fmt is None:
+                fmt = 'wmf'
             count += 1
-            name = (('%4d'%count).replace(' ', '0'))+'.wmf'
-            open(name, 'wb').write(data)
+            name = '%04d.%s' % (count, fmt)
+            with open(name, 'wb') as f:
+                f.write(data)
             imap[count] = name
             #open(name+'.hex', 'wb').write(enc)
         return self.convert_images(imap)
 
     def convert_images(self, imap):
-        for count, val in imap.items():
+        self.default_img = None
+        for count, val in imap.iteritems():
             try:
                 imap[count] = self.convert_image(val)
             except:
@@ -159,6 +169,8 @@ class RTFInput(InputFormatPlugin):
         return imap
 
     def convert_image(self, name):
+        if not name.endswith('.wmf'):
+            return name
         try:
             return self.rasterize_wmf(name)
         except:
@@ -167,16 +179,18 @@ class RTFInput(InputFormatPlugin):
 
     def replace_wmf(self, name):
         from calibre.ebooks import calibre_cover
-        data = calibre_cover('Conversion of WMF images is not supported',
+        if self.default_img is None:
+            self.default_img = calibre_cover('Conversion of WMF images is not supported',
             'Use Microsoft Word or OpenOffice to save this RTF file'
             ' as HTML and convert that in calibre.', title_size=36,
             author_size=20)
         name = name.replace('.wmf', '.jpg')
         with open(name, 'wb') as f:
-            f.write(data)
+            f.write(self.default_img)
         return name
 
     def rasterize_wmf(self, name):
+        raise ValueError('Conversion of WMF images not supported')
         from calibre.utils.wmf import extract_raster_image
         with open(name, 'rb') as f:
             data = f.read()
@@ -212,27 +226,27 @@ class RTFInput(InputFormatPlugin):
         css += '\n'+'\n'.join(font_size_classes)
         css += '\n' +'\n'.join(color_classes)
 
-        for cls, val in border_styles.items():
+        for cls, val in border_styles.iteritems():
             css += '\n\n.%s {\n%s\n}'%(cls, val)
 
         with open('styles.css', 'ab') as f:
             f.write(css)
 
-    def preprocess(self, fname):
-        self.log('\tPreprocessing to convert unicode characters')
-        try:
-            data = open(fname, 'rb').read()
-            from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
-            tokenizer = RtfTokenizer(data)
-            tokens = RtfTokenParser(tokenizer.tokens)
-            data = tokens.toRTF()
-            fname = 'preprocessed.rtf'
-            with open(fname, 'wb') as f:
-                f.write(data)
-        except:
-            self.log.exception(
-            'Failed to preprocess RTF to convert unicode sequences, ignoring...')
-        return fname
+    # def preprocess(self, fname):
+        # self.log('\tPreprocessing to convert unicode characters')
+        # try:
+            # data = open(fname, 'rb').read()
+            # from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
+            # tokenizer = RtfTokenizer(data)
+            # tokens = RtfTokenParser(tokenizer.tokens)
+            # data = tokens.toRTF()
+            # fname = 'preprocessed.rtf'
+            # with open(fname, 'wb') as f:
+                # f.write(data)
+        # except:
+            # self.log.exception(
+            # 'Failed to preprocess RTF to convert unicode sequences, ignoring...')
+        # return fname
 
     def convert_borders(self, doc):
         border_styles = []
@@ -269,17 +283,14 @@ class RTFInput(InputFormatPlugin):
         self.log = log
         self.log('Converting RTF to XML...')
         #Name of the preprocesssed RTF file
-        fname = self.preprocess(stream.name)
+        # fname = self.preprocess(stream.name)
         try:
-            xml = self.generate_xml(fname)
+            xml = self.generate_xml(stream.name)
         except RtfInvalidCodeException, e:
+            self.log.exception('Unable to parse RTF')  # keep the traceback in the log
             raise ValueError(_('This RTF file has a feature calibre does not '
             'support. Convert it to HTML first and then try it.\n%s')%e)
 
-        '''dataxml = open('dataxml.xml', 'w')
-        dataxml.write(xml)
-        dataxml.close'''
-
         d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
         if d:
             imap = {}
diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py
index 7b89407f79..cdd9a3d088 100755
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@@ -17,7 +17,8 @@
 #########################################################################
 # $Revision: 1.41 $
 # $Date: 2006/03/24 23:50:07 $
-import sys,os
+import sys, os
+
 from calibre.ebooks.rtf2xml import headings_to_sections, \
     line_endings, footnote, fields_small, default_encoding, \
     make_lists, preamble_div, header, colors, group_borders, \
@@ -90,7 +91,6 @@ class ParseRtf:
                 out_file = '',
                 out_dir = None,
                 dtd = '',
-                #debug = 0, #why? calibre
                 deb_dir = None,
                 convert_symbol = None,
                 convert_wingdings = None,
@@ -107,6 +107,7 @@ class ParseRtf:
                 no_dtd = 0,
                 char_data = '',
                 ):
+
         """
         Requires:
         'file' --file to parse
@@ -119,12 +120,11 @@ class ParseRtf:
             script tries to output to directory where is script is exectued.)
             'deb_dir' --debug directory. If a debug_dir is provided, the script
             will copy each run through as a file to examine in the debug_dir
-            'perl_script'--use perl to make tokens. This runs just a bit faster.
-            (I will probably phase this out.)
             'check_brackets' -- make sure the brackets match up after each run
             through a file. Only for debugging.
         Returns: Nothing
         """
+
         self.__file = in_file
         self.__out_file = out_file
         self.__out_dir = out_dir
@@ -132,7 +132,7 @@ class ParseRtf:
         self.__dtd_path = dtd
         self.__check_file(in_file,"file_to_parse")
         self.__char_data = char_data
-        self.__debug_dir = deb_dir #self.__debug_dir = debug calibre
+        self.__debug_dir = deb_dir
         self.__check_dir(self.__temp_dir)
         self.__copy = self.__check_dir(self.__debug_dir)
         self.__convert_caps = convert_caps
@@ -155,25 +155,24 @@ class ParseRtf:
         if hasattr(the_file, 'read'): return
         if the_file == None:
             if type == "file_to_parse":
-                message = "You must provide a file for the script to work"
-            msg = message
+                msg = "\nYou must provide a file for the script to work"
             raise RtfInvalidCodeException, msg
         elif os.path.exists(the_file):
             pass # do nothing
         else:
-            message = "The file '%s' cannot be found" % the_file
-            msg = message
+            msg = "\nThe file '%s' cannot be found" % the_file
             raise RtfInvalidCodeException, msg
+
     def __check_dir(self, the_dir):
         """Check to see if directory exists"""
         if not the_dir :
             return
         dir_exists = os.path.isdir(the_dir)
         if not dir_exists:
-            message = "%s is not a directory" % the_dir
-            msg = message
+            msg = "\n%s is not a directory" % the_dir
             raise RtfInvalidCodeException, msg
         return 1
+
     def parse_rtf(self):
         """
         Parse the file by calling on other classes.
@@ -194,13 +193,14 @@ class ParseRtf:
             copy_obj.set_dir(self.__debug_dir)
             copy_obj.remove_files()
             copy_obj.copy_file(self.__temp_file, "original_file")
-        # new as of 2005-08-02. Do I want this?
+        # Set up the object that checks that brackets are balanced
         if self.__debug_dir or self.__run_level > 2:
             self.__check_brack_obj = check_brackets.CheckBrackets\
             (file = self.__temp_file,
                 bug_handler = RtfInvalidCodeException,
                     )
-        # convert Macintosh line endings to Unix line endings
+        # Convert Macintosh and Windows line endings to Unix line endings.
+        # TODO: why do this if the file is not written in 'wb' mode afterwards?
         line_obj = line_endings.FixLineEndings(
                 in_file = self.__temp_file,
                 bug_handler = RtfInvalidCodeException,
@@ -208,13 +208,13 @@ class ParseRtf:
                 run_level = self.__run_level,
                 replace_illegals = self.__replace_illegals,
                 )
-        return_value = line_obj.fix_endings()
+        return_value = line_obj.fix_endings() #calibre return what?
         self.__return_code(return_value)
         tokenize_obj = tokenize.Tokenize(
                 bug_handler = RtfInvalidCodeException,
                 in_file = self.__temp_file,
                 copy = self.__copy,
-                run_level = self.__run_level,)
+                run_level = self.__run_level)
         tokenize_obj.tokenize()
         process_tokens_obj = process_tokens.ProcessTokens(
             in_file = self.__temp_file,
@@ -230,12 +230,25 @@ class ParseRtf:
                 os.remove(self.__temp_file)
             except OSError:
                 pass
+            #Check to see if the file is correctly encoded
+            encode_obj = default_encoding.DefaultEncoding(
+            in_file = self.__temp_file,
+            run_level = self.__run_level,
+            bug_handler = RtfInvalidCodeException,
+            check_raw = True,
+            )
+            platform, code_page, default_font_num = encode_obj.find_default_encoding()
             check_encoding_obj = check_encoding.CheckEncoding(
-                bug_handler = RtfInvalidCodeException,
-                    )
-            check_encoding_obj.check_encoding(self.__file)
-            sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
-            raise InvalidRtfException, msg
+                    bug_handler = RtfInvalidCodeException,
+                        )
+            enc = encode_obj.get_codepage()
+            if enc != 'mac_roman':
+                enc = 'cp' + enc
+            if check_encoding_obj.check_encoding(self.__file, enc):
+                file_name = self.__file if isinstance(self.__file, str) \
+                                    else self.__file.encode('utf-8')
+                msg = 'File %s does not appear to be correctly encoded.\n' % file_name
+                raise InvalidRtfException, msg
         delete_info_obj = delete_info.DeleteInfo(
             in_file = self.__temp_file,
             copy = self.__copy,
@@ -508,6 +521,7 @@ class ParseRtf:
                 indent = self.__indent,
                 run_level = self.__run_level,
                 no_dtd = self.__no_dtd,
+                encoding = encode_obj.get_codepage(),
                 bug_handler = RtfInvalidCodeException,
                 )
         tags_obj.convert_to_tags()
@@ -520,35 +534,28 @@ class ParseRtf:
         output_obj.output()
         os.remove(self.__temp_file)
         return self.__exit_level
+
     def __bracket_match(self, file_name):
         if self.__run_level > 2:
             good_br, msg =  self.__check_brack_obj.check_brackets()
             if good_br:
                 pass
-                # sys.stderr.write( msg + ' in ' + file_name + "\n")
+                #sys.stderr.write( msg + ' in ' + file_name + "\n")
             else:
-                msg += msg +  " in file '" + file_name + "'\n"
+                msg = '%s in file %s\n' % (msg, file_name)
                 raise RtfInvalidCodeException, msg
+
     def __return_code(self, num):
-        if num == None:
-            return
-        if int(num) > self.__exit_level:
-            self.__exit_level = num
+        if num is None:  # this step reported no code
+            return
+        if int(num) > self.__exit_level:  # keep the highest code seen so far
+            self.__exit_level = num
+
     def __make_temp_file(self,file):
         """Make a temporary file to parse"""
         write_file="rtf_write_file"
         read_obj = file if hasattr(file, 'read') else open(file,'r')
-        write_obj = open(write_file, 'w')
-        line = "dummy"
-        while line:
-            line = read_obj.read(1000)
-            write_obj.write(line )
-        write_obj.close()
+        with open(write_file, 'wb') as write_obj:
+            for line in read_obj:
+                write_obj.write(line)
         return write_file
-    """
-mi<tg<open______<style-sheet\n
-mi<tg<close_____<style-sheet\n
-mi<tg<open-att__<footnote<num>1\n
-mi<tg<empty-att_<page-definition<margin>33\n
-mi<tg<empty_____<para\n
-"""
diff --git a/src/calibre/ebooks/rtf2xml/check_brackets.py b/src/calibre/ebooks/rtf2xml/check_brackets.py
index 418469467d..361cc034e0 100755
--- a/src/calibre/ebooks/rtf2xml/check_brackets.py
+++ b/src/calibre/ebooks/rtf2xml/check_brackets.py
@@ -24,38 +24,38 @@ class CheckBrackets:
         self.__ob_count = 0
         self.__cb_count = 0
         self.__open_bracket_num = []
+
     def open_brack(self, line):
         num = line[-5:-1]
         self.__open_bracket_num.append(num)
         self.__bracket_count += 1
+
     def close_brack(self, line):
         num = line[-5:-1]
-        ##self.__open_bracket_num.append(num)
         try:
             last_num = self.__open_bracket_num.pop()
         except:
-            return 0
+            return False
         if num != last_num:
-            return 0
+            return False
         self.__bracket_count -= 1
-        return 1
+        return True
+
     def check_brackets(self):
-        read_obj = open(self.__file, 'r')
-        line = 'dummy'
         line_count = 0
-        while line:
-            line_count += 1
-            line = read_obj.readline()
-            self.__token_info = line[:16]
-            if self.__token_info == 'ob<nu<open-brack':
-                self.open_brack(line)
-            if self.__token_info == 'cb<nu<clos-brack':
-                right_count = self.close_brack(line)
-                if not right_count:
-                    return (0, "closed bracket doesn't match, line %s" % line_count)
-        read_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            for line in read_obj:
+                line_count += 1
+                self.__token_info = line[:16]
+                if self.__token_info == 'ob<nu<open-brack':
+                    self.open_brack(line)
+                if self.__token_info == 'cb<nu<clos-brack':
+                    if not self.close_brack(line):
+                        return (False, "closed bracket doesn't match, line %s" % line_count)
+
         if self.__bracket_count != 0:
-            msg = 'At end of file open and closed brackets don\'t match\n'
-            msg = msg + 'total number of brackets is %s' % self.__bracket_count
-            return (0, msg)
-        return (1, "brackets match!")
+            msg = ('At end of file open and closed brackets don\'t match\n' \
+                        'total number of brackets is %s') % self.__bracket_count
+            return (False, msg)
+        return (True, "Brackets match!")
+
diff --git a/src/calibre/ebooks/rtf2xml/check_encoding.py b/src/calibre/ebooks/rtf2xml/check_encoding.py
index f6810e4909..0f52320aea 100755
--- a/src/calibre/ebooks/rtf2xml/check_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/check_encoding.py
@@ -1,8 +1,11 @@
 #!/usr/bin/env python
 import sys
+
 class CheckEncoding:
+
     def __init__(self, bug_handler):
         self.__bug_handler = bug_handler
+
     def __get_position_error(self, line, encoding, line_num):
         char_position = 0
         for char in line:
@@ -12,21 +15,23 @@ class CheckEncoding:
             except UnicodeError, msg:
                 sys.stderr.write('line: %s char: %s\n' %  (line_num, char_position))
                 sys.stderr.write(str(msg) + '\n')
-    def check_encoding(self, path, encoding='us-ascii'):
-        read_obj = open(path, 'r')
-        line_to_read = 1
+
+    def check_encoding(self, path, encoding='us-ascii', verbose=True):
         line_num = 0
-        while line_to_read:
-            line_num += 1
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            try:
-                line.decode(encoding)
-            except UnicodeError:
-                if len(line) < 1000:
-                    self.__get_position_error(line, encoding, line_num)
-                else:
-                    sys.stderr.write('line: %d has bad encoding\n'%line_num)
+        with open(path, 'r') as read_obj:
+            for line in read_obj:
+                line_num += 1
+                try:
+                    line.decode(encoding)
+                except UnicodeError:
+                    if verbose:
+                        if len(line) < 1000:
+                            self.__get_position_error(line, encoding, line_num)
+                        else:
+                            sys.stderr.write('line: %d has bad encoding\n' % line_num)
+                    return True
+        return False
+
 if __name__ == '__main__':
     check_encoding_obj = CheckEncoding()
     check_encoding_obj.check_encoding(sys.argv[1])
diff --git a/src/calibre/ebooks/rtf2xml/combine_borders.py b/src/calibre/ebooks/rtf2xml/combine_borders.py
index 71cd822e30..eaf09d0842 100755
--- a/src/calibre/ebooks/rtf2xml/combine_borders.py
+++ b/src/calibre/ebooks/rtf2xml/combine_borders.py
@@ -16,7 +16,9 @@
 #                                                                       #
 #########################################################################
 import os, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+
 class CombineBorders:
     """Combine borders in RTF tokens to make later processing easier"""
     def __init__(self,
@@ -32,28 +34,31 @@ class CombineBorders:
         self.__state = 'default'
         self.__bord_pos = 'default'
         self.__bord_att = []
+
     def found_bd(self, line):
         #cw<bd<bor-t-r-vi
         self.__state = 'border'
         self.__bord_pos = line[6:16]
+
     def __default_func(self, line):
         #cw<bd<bor-t-r-vi
         if self.__first_five == 'cw<bd':
             self.found_bd(line)
             return ''
         return line
+
     def end_border(self, line, write_obj):
-        joiner = "|"
-        border_string = joiner.join(self.__bord_att)
+        border_string = "|".join(self.__bord_att)
         self.__bord_att = []
         write_obj.write('cw<bd<%s<nu<%s\n' % (self.__bord_pos,
-        border_string))
+                                                border_string))
         self.__state = 'default'
         self.__bord_string = ''
         if self.__first_five == 'cw<bd':
             self. found_bd(line)
         else:
             write_obj.write(line)
+
     def add_to_border_desc(self, line):
         #cw<bt<bdr-hair__<nu<true
         #cw<bt<bdr-linew<nu<0.50
@@ -65,26 +70,22 @@ class CombineBorders:
         else:
             num = ':' + num
         self.__bord_att.append(border_desc + num)
+
     def __border_func(self, line, write_obj):
         if self.__first_five != 'cw<bt':
             self.end_border(line, write_obj)
         else:
             self.add_to_border_desc(line)
+
     def combine_borders(self):
-        read_obj = open(self.__file, 'r')
-        write_obj = open(self.__write_to, 'w')
-        line_to_read = 'dummy'
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__first_five = line[0:5]
-            if self.__state == 'border':
-                self.__border_func(line, write_obj)
-            else:
-                to_print = self.__default_func(line)
-                write_obj.write(to_print)
-        read_obj.close()
-        write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as write_obj:
+                for line in read_obj:
+                    self.__first_five = line[0:5]
+                    if self.__state == 'border':
+                        self.__border_func(line, write_obj)
+                    else:
+                        write_obj.write(self.__default_func(line))
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "combine_borders.data")
diff --git a/src/calibre/ebooks/rtf2xml/convert_to_tags.py b/src/calibre/ebooks/rtf2xml/convert_to_tags.py
index ab54c0cbc3..6927537474 100755
--- a/src/calibre/ebooks/rtf2xml/convert_to_tags.py
+++ b/src/calibre/ebooks/rtf2xml/convert_to_tags.py
@@ -1,6 +1,9 @@
-import os, tempfile
-from calibre.ebooks.rtf2xml import copy
+import os, tempfile, sys
+
+from calibre.ebooks.rtf2xml import copy, check_encoding
+
 public_dtd = 'rtf2xml1.0.dtd'
+
 class ConvertToTags:
     """
     Convert file to XML
@@ -10,6 +13,7 @@ class ConvertToTags:
             bug_handler,
             dtd_path,
             no_dtd,
+            encoding,
             indent = None,
             copy = None,
             run_level = 1,
@@ -29,9 +33,14 @@ class ConvertToTags:
         self.__copy = copy
         self.__dtd_path = dtd_path
         self.__no_dtd = no_dtd
+        if encoding != 'mac_roman':
+            self.__encoding = 'cp' + encoding
+        else:
+            self.__encoding = 'mac_roman'
         self.__indent = indent
         self.__run_level = run_level
         self.__write_to = tempfile.mktemp()
+
     def __initiate_values(self):
         """
         Set values, including those for the dictionary.
@@ -61,6 +70,7 @@ class ConvertToTags:
         'tx<ut<__________'  :   self.__text_func,
         'mi<tg<empty_____'  :   self.__empty_func,
         }
+
     def __open_func(self, line):
         """
         Print the opening tag and newlines when needed.
@@ -73,6 +83,7 @@ class ConvertToTags:
         if info in self.__two_new_line:
             self.__write_extra_new_line()
         self.__write_obj.write('<%s>' % info)
+
     def __empty_func(self, line):
         """
         Print out empty tag and newlines when needed.
@@ -85,10 +96,11 @@ class ConvertToTags:
             self.__write_new_line()
         if info in self.__two_new_line:
             self.__write_extra_new_line()
+
     def __open_att_func(self, line):
         """
         Process lines for open tags that have attributes.
-        The important infor is between [17:-1]. Take this info and split it
+        The important info is between [17:-1]. Take this info and split it
         with the delimeter '<'. The first token in this group is the element
         name. The rest are attributes, separated fromt their values by '>'. So
         read each token one at a time, and split them by '>'.
@@ -119,6 +131,7 @@ class ConvertToTags:
             self.__write_new_line()
         if element_name in self.__two_new_line:
             self.__write_extra_new_line()
+
     def __empty_att_func(self, line):
         """
         Same as the __open_att_func, except a '/' is placed at the end of the tag.
@@ -143,6 +156,7 @@ class ConvertToTags:
             self.__write_new_line()
         if element_name in self.__two_new_line:
             self.__write_extra_new_line()
+
     def __close_func(self, line):
         """
         Print out the closed tag and new lines, if appropriate.
@@ -156,6 +170,7 @@ class ConvertToTags:
             self.__write_new_line()
         if info in self.__two_new_line:
             self.__write_extra_new_line()
+
     def __text_func(self, line):
         """
         Simply print out the information between [17:-1]
@@ -163,6 +178,7 @@ class ConvertToTags:
         #tx<nu<__________<Normal;
         # change this!
         self.__write_obj.write(line[17:-1])
+
     def __write_extra_new_line(self):
         """
         Print out extra new lines if the new lines have not exceeded two. If
@@ -172,8 +188,10 @@ class ConvertToTags:
             return
         if self.__new_line < 2:
             self.__write_obj.write('\n')
+
     def __default_func(self, line):
         pass
+
     def __write_new_line(self):
         """
         Print out a new line if a new line has not already been printed out.
@@ -183,11 +201,23 @@ class ConvertToTags:
         if not self.__new_line:
             self.__write_obj.write('\n')
             self.__new_line += 1
+
     def __write_dec(self):
         """
         Write the XML declaration at the top of the document.
         """
-        self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
+        # Keep maximum compatibility with previous versions: declare US-ASCII whenever the file allows it
+        check_encoding_obj = check_encoding.CheckEncoding(
+                    bug_handler=self.__bug_handler)
+        # check_encoding() returns True when the file does NOT decode with the given encoding
+        if not check_encoding_obj.check_encoding(self.__file, verbose=False):
+            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
+        elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
+            self.__write_obj.write('<?xml version="1.0" encoding="%s" ?>' % self.__encoding)
+        else:
+            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
+            sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
+                    ' hope for the best\n')
         self.__new_line = 0
         self.__write_new_line()
         if self.__no_dtd:
@@ -207,6 +237,7 @@ class ConvertToTags:
             )
         self.__new_line = 0
         self.__write_new_line()
+
     def convert_to_tags(self):
         """
         Read in the file one line at a time. Get the important info, between
@@ -222,18 +253,14 @@ class ConvertToTags:
             an empty tag function.
             """
         self.__initiate_values()
-        read_obj = open(self.__file, 'r')
         self.__write_obj = open(self.__write_to, 'w')
         self.__write_dec()
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            action = self.__state_dict.get(self.__token_info)
-            if action != None:
-                action(line)
-        read_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            for line in read_obj:
+                self.__token_info = line[:16]
+                action = self.__state_dict.get(self.__token_info)
+                if action is not None:
+                    action(line)
         self.__write_obj.close()
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
diff --git a/src/calibre/ebooks/rtf2xml/copy.py b/src/calibre/ebooks/rtf2xml/copy.py
index ff029c1841..1b620b9fbf 100755
--- a/src/calibre/ebooks/rtf2xml/copy.py
+++ b/src/calibre/ebooks/rtf2xml/copy.py
@@ -23,6 +23,7 @@ class Copy:
     def __init__(self, bug_handler, file = None, deb_dir = None, ):
         self.__file = file
         self.__bug_handler = bug_handler
+
     def set_dir(self, deb_dir):
         """Set the temporary directory to write files to"""
         if deb_dir is None:
@@ -33,19 +34,11 @@ class Copy:
             message = "%(deb_dir)s is not a directory" % vars()
             raise self.__bug_handler , message
         Copy.__dir = deb_dir
+
     def remove_files(self ):
         """Remove files from directory"""
         self.__remove_the_files(Copy.__dir)
-        """
-        list_of_files = os.listdir(Copy.__dir)
-        list_of_files = os.listdir(the_dir)
-        for file in list_of_files:
-            rem_file = os.path.join(Copy.__dir,file)
-            if os.path.isdir(rem_file):
-                self.remove_files(rem_file)
-            else:
-                os.remove(rem_file)
-        """
+
     def __remove_the_files(self, the_dir):
         """Remove files from directory"""
         list_of_files = os.listdir(the_dir)
@@ -58,6 +51,7 @@ class Copy:
                     os.remove(rem_file)
                 except OSError:
                     pass
+
     def copy_file(self, file, new_file):
         """
         Copy the file to a new name
diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py
index b932b465d0..53887e0d90 100755
--- a/src/calibre/ebooks/rtf2xml/default_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/default_encoding.py
@@ -1,61 +1,142 @@
 #########################################################################
 #                                                                       #
-#                                                                       #
 #   copyright 2002 Paul Henry Tremblay                                  #
 #                                                                       #
-#   This program is distributed in the hope that it will be useful,     #
-#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
-#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
-#   General Public License for more details.                            #
-#                                                                       #
-#   You should have received a copy of the GNU General Public License   #
-#   along with this program; if not, write to the Free Software         #
-#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA            #
-#   02111-1307 USA                                                      #
-#                                                                       #
-#                                                                       #
 #########################################################################
+
+'''
+Codepages according to the RTF 1.9.1 specification:
+    437	United States IBM
+    708	Arabic (ASMO 708)
+    709	Arabic (ASMO 449+, BCON V4)
+    710	Arabic (transparent Arabic)
+    711	Arabic (Nafitha Enhanced)
+    720	Arabic (transparent ASMO)
+    819	Windows 3.1 (United States and Western Europe)
+    850	IBM multilingual
+    852	Eastern European
+    860	Portuguese
+    862	Hebrew
+    863	French Canadian
+    864	Arabic
+    865	Norwegian
+    866	Soviet Union
+    874	Thai
+    932	Japanese
+    936	Simplified Chinese
+    949	Korean
+    950	Traditional Chinese
+    1250	Eastern European
+    1251	Cyrillic
+    1252	Western European
+    1253	Greek
+    1254	Turkish
+    1255	Hebrew
+    1256	Arabic
+    1257	Baltic
+    1258	Vietnamese
+    1361	Johab
+    10000	MAC Roman
+    10001	MAC Japan
+    10004	MAC Arabic
+    10005	MAC Hebrew
+    10006	MAC Greek
+    10007	MAC Cyrillic
+    10029	MAC Latin2
+    10081	MAC Turkish
+    57002	Devanagari
+    57003	Bengali
+    57004	Tamil
+    57005	Telugu
+    57006	Assamese
+    57007	Oriya
+    57008	Kannada
+    57009	Malayalam
+    57010	Gujarati
+    57011	Punjabi
+'''
+import re
+
 class DefaultEncoding:
     """
     Find the default encoding for the doc
     """
-    def __init__(self, in_file, bug_handler, run_level = 1,):
-        """
-        Required:
-            'file'
-        Returns:
-            nothing
-            """
+    def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False):
         self.__file = in_file
         self.__bug_handler = bug_handler
+        self.__platform = 'Windows'
+        self.__default_num = 'not-defined'
+        self.__code_page = '1252'
+        self.__datafetched = False
+        self.__fetchraw = check_raw
+
     def find_default_encoding(self):
-        platform = 'Windows'
-        default_num = 'not-defined'
-        code_page = 'ansicpg1252'
-        read_obj = open(self.__file, 'r')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            if self.__token_info == 'mi<mk<rtfhed-end':
-                break
-            if self.__token_info == 'cw<ri<ansi-codpg':
-                #cw<ri<ansi-codpg<nu<10000
-                num = line[20:-1]
-                if not num:
-                    num = '1252'
-                code_page = 'ansicpg' + num
-            if self.__token_info == 'cw<ri<macintosh_':
-                platform = 'Macintosh'
-            if self.__token_info == 'cw<ri<deflt-font':
-                default_num = line[20:-1]
-                #cw<ri<deflt-font<nu<0
-            #action = self.__state_dict.get(self.__state)
-            #if action == None:
-                #print self.__state
-            #action(line)
-        read_obj.close()
-        if platform == 'Macintosh':
-            code_page = 'mac_roman'
-        return platform, code_page, default_num
+        if not self.__datafetched:
+            self._encoding()
+            self.__datafetched = True
+        if self.__platform == 'Macintosh':
+            code_page = self.__code_page
+        else:
+            code_page = 'ansicpg' + self.__code_page
+        return self.__platform, code_page, self.__default_num
+
+    def get_codepage(self):
+        if not self.__datafetched:
+            self._encoding()
+            self.__datafetched = True
+        return self.__code_page
+
+    def get_platform(self):
+        if not self.__datafetched:
+            self._encoding()
+            self.__datafetched = True
+        return self.__platform
+
+    def _encoding(self):
+        """Read the code page (and, for tokenized input, platform and default font)."""
+        with open(self.__file, 'r') as read_obj:
+            if not self.__fetchraw:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    if self.__token_info == 'mi<mk<rtfhed-end':
+                        break
+                    if self.__token_info == 'cw<ri<ansi-codpg':
+                        #cw<ri<ansi-codpg<nu<10000
+                        num = line[20:-1]
+                        # empty, non-numeric or zero code page: use 1252
+                        self.__code_page = num if num.isdigit() and int(num) else '1252'
+                    if self.__token_info == 'cw<ri<macintosh_':
+                        self.__platform = 'Macintosh'
+                        self.__code_page = 'mac_roman'
+                    elif self.__token_info == 'cw<ri<pc________':
+                        self.__platform = 'IBMPC'
+                        self.__code_page = '437'
+                    elif self.__token_info == 'cw<ri<pca_______':
+                        self.__platform = 'OS/2'
+                        self.__code_page = '850'
+                    if self.__token_info == 'cw<ri<deflt-font':
+                        self.__default_num = line[20:-1]
+                        #cw<ri<deflt-font<nu<0
+            else:
+                fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
+                fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
+                platform_cp = {'mac': 'mac_roman', 'pc': '437', 'pca': '850'}
+                for line in read_obj:
+                    cp_match = fenccp.search(line)
+                    if cp_match:
+                        # treat \ansicpg0 as unspecified: keep the current value
+                        if int(cp_match.group(1)):
+                            self.__code_page = cp_match.group(1)
+                        break
+                    enc_match = fenc.search(line)
+                    if enc_match:
+                        self.__code_page = platform_cp.get(enc_match.group(1),
+                                                           self.__code_page)
+
+# if __name__ == '__main__':
+    # encode_obj = DefaultEncoding(
+            # in_file = sys.argv[1],
+            # bug_handler = Exception,
+            # check_raw = True,
+            # )
+    # print encode_obj.get_codepage()
diff --git a/src/calibre/ebooks/rtf2xml/delete_info.py b/src/calibre/ebooks/rtf2xml/delete_info.py
index f79caa3aae..fed47b1e75 100755
--- a/src/calibre/ebooks/rtf2xml/delete_info.py
+++ b/src/calibre/ebooks/rtf2xml/delete_info.py
@@ -16,7 +16,9 @@
 #                                                                       #
 #########################################################################
 import sys, os, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+
 class DeleteInfo:
     """Delelet unecessary destination groups"""
     def __init__(self,
@@ -29,17 +31,18 @@ class DeleteInfo:
         self.__bug_handler = bug_handler
         self.__copy = copy
         self.__write_to = tempfile.mktemp()
-        self.__bracket_count=0
+        self.__bracket_count= 0
         self.__ob_count = 0
         self.__cb_count = 0
-        self.__after_asterisk = 0
-        self.__delete = 0
+        # self.__after_asterisk = False
+        # self.__delete = 0
         self.__initiate_allow()
         self.__ob = 0
-        self.__write_cb = 0
+        self.__write_cb = False
         self.__run_level = run_level
-        self.__found_delete = 0
-        self.__list = 0
+        self.__found_delete = False
+        # self.__list = False
+
     def __initiate_allow(self):
         """
         Initiate a list of destination groups which should be printed out.
@@ -66,9 +69,10 @@ class DeleteInfo:
         self.__state_dict = {
             'default'           : self.__default_func,
             'after_asterisk'    : self.__asterisk_func,
-            'delete'           : self.__delete_func,
+            'delete'            : self.__delete_func,
             'list'              : self.__list_func,
         }
+
     def __default_func(self,line):
         """Handle lines when in no special state. Look for an asterisk to
         begin a special state. Otherwise, print out line."""
@@ -81,27 +85,29 @@ class DeleteInfo:
             if self.__ob:
                 self.__write_obj.write(self.__ob)
             self.__ob = line
-            return 0
+            return False
         else:
             # write previous bracket, since didn't fine asterisk
             if self.__ob:
                 self.__write_obj.write(self.__ob)
                 self.__ob = 0
-            return 1
+            return True
+
     def __delete_func(self,line):
         """Handle lines when in delete state. Don't print out lines
         unless the state has ended."""
         if self.__delete_count == self.__cb_count:
             self.__state = 'default'
             if self.__write_cb:
-                self.__write_cb = 0
-                return 1
-            return 0
+                self.__write_cb = False
+                return True
+            return False
+
     def __asterisk_func(self,line):
         """
         Determine whether to delete info in group
         Note on self.__cb flag.
-        If you find that you are in a delete group, and the preivous
+        If you find that you are in a delete group, and the previous
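-        token in not an open bracket (self.__ob = 0), that means
+        token is not an open bracket (self.__ob = 0), that means
         that the delete group is nested inside another acceptable
-        detination group. In this case, you have alrady written
+        destination group. In this case, you have already written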
@@ -110,21 +116,21 @@ class DeleteInfo:
         """
         # Test for {\*}, in which case don't enter
         # delete state
-        self.__after_asterisk = 0 # only enter this function once
-        self.__found_delete = 1
+        # self.__after_asterisk = False # only enter this function once
+        self.__found_delete = True
         if self.__token_info == 'cb<nu<clos-brack':
             if self.__delete_count == self.__cb_count:
                 self.__state = 'default'
                 self.__ob = 0
                 # changed this because haven't printed out start
-                return 0
+                return False
             else:
                 # not sure what happens here!
                 # believe I have a '{\*}
                 if self.__run_level > 3:
                     msg = 'flag problem\n'
                     raise self.__bug_handler, msg
-                return 1
+                return True
         elif self.__token_info in self.__allowable :
             if self.__ob:
                 self.__write_obj.write(self.__ob)
@@ -132,85 +138,81 @@ class DeleteInfo:
                 self.__state = 'default'
             else:
                 pass
-            return 1
+            return True
         elif self.__token_info == 'cw<ls<list______':
             self.__ob = 0
             self.__found_list_func(line)
         elif self.__token_info in self.__not_allowable:
             if not self.__ob:
-                self.__write_cb = 1
+                self.__write_cb = True
             self.__ob = 0
             self.__state = 'delete'
             self.__cb_count = 0
-            return 0
+            return False
         else:
             if self.__run_level > 5:
-                msg = 'After an asterisk, and found neither an allowable or non-allowble token\n'
-                msg += 'token is "%s"\n' % self.__token_info
-                raise self.__bug_handler
+                msg = ('After an asterisk, and found neither an allowable or non-allowable token\n\
+                            token is "%s"\n') % self.__token_info
+                raise self.__bug_handler, msg
             if not self.__ob:
-                self.__write_cb = 1
+                self.__write_cb = True
             self.__ob = 0
             self.__state = 'delete'
             self.__cb_count = 0
-            return 0
+            return False
+
     def __found_list_func(self, line):
         """
         print out control words in this group
         """
         self.__state = 'list'
+
     def __list_func(self, line):
         """
         Check to see if the group has ended.
-        Return 1 for all control words.
-        Return 0 otherwise.
+        Return True for all control words.
+        Return False otherwise.
         """
         if self.__delete_count == self.__cb_count and self.__token_info ==\
             'cb<nu<clos-brack':
             self.__state = 'default'
             if self.__write_cb:
-                self.__write_cb = 0
-                return 1
-            return 0
+                self.__write_cb = False
+                return True
+            return False
         elif line[0:2] == 'cw':
-            return 1
+            return True
         else:
-            return 0
+            return False
+
     def delete_info(self):
         """Main method for handling other methods. Read one line in at
-        a time, and determine wheter to print the line based on the state."""
-        line_to_read = 'dummy'
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        while line_to_read:
-            #ob<nu<open-brack<0001
-            to_print =1
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            if self.__token_info == 'ob<nu<open-brack':
-                self.__ob_count = line[-5:-1]
-            if self.__token_info == 'cb<nu<clos-brack':
-                self.__cb_count = line[-5:-1]
-            action = self.__state_dict.get(self.__state)
-            if not action:
-                sys.stderr.write('No action in dictionary state is "%s" \n'
-                        % self.__state)
-            to_print = action(line)
-            """
-            if self.__after_asterisk:
-                to_print = self.__asterisk_func(line)
-            elif self.__list:
-                self.__in_list_func(line)
-            elif self.__delete:
-                to_print = self.__delete_func(line)
-            else:
-                to_print = self.__default_func(line)
-            """
-            if to_print:
-                self.__write_obj.write(line)
-        self.__write_obj.close()
-        read_obj.close()
+        a time, and determine whether to print the line based on the state."""
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
+                    #ob<nu<open-brack<0001
+                    to_print = True
+                    self.__token_info = line[:16]
+                    if self.__token_info == 'ob<nu<open-brack':
+                        self.__ob_count = line[-5:-1]
+                    if self.__token_info == 'cb<nu<clos-brack':
+                        self.__cb_count = line[-5:-1]
+                    action = self.__state_dict.get(self.__state)
+                    if not action:
+                        sys.stderr.write(_('No action in dictionary state is "%s" \n')
+                                % self.__state)
+                    to_print = action(line)
+                    # if self.__after_asterisk:
+                        # to_print = self.__asterisk_func(line)
+                    # elif self.__list:
+                        # self.__in_list_func(line)
+                    # elif self.__delete:
+                        # to_print = self.__delete_func(line)
+                    # else:
+                        # to_print = self.__default_func(line)
+                    if to_print:
+                        self.__write_obj.write(line)
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "delete_info.data")
diff --git a/src/calibre/ebooks/rtf2xml/footnote.py b/src/calibre/ebooks/rtf2xml/footnote.py
index a596ca73f6..6ac12f65e6 100755
--- a/src/calibre/ebooks/rtf2xml/footnote.py
+++ b/src/calibre/ebooks/rtf2xml/footnote.py
@@ -16,7 +16,9 @@
 #                                                                       #
 #########################################################################
 import os, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+
 class Footnote:
     """
     Two public methods are available. The first separates all of the
@@ -35,6 +37,7 @@ class Footnote:
         self.__copy = copy
         self.__write_to = tempfile.mktemp()
         self.__found_a_footnote = 0
+
     def __first_line_func(self, line):
         """
         Print the tag info for footnotes.  Check whether footnote is an
@@ -47,6 +50,7 @@ class Footnote:
             self.__write_to_foot_obj.write(
             'mi<tg<open-att__<footnote<num>%s\n' % self.__footnote_count)
         self.__first_line = 0
+
     def __in_footnote_func(self, line):
         """Handle all tokens that are part of footnote"""
         if self.__first_line:
@@ -68,6 +72,7 @@ class Footnote:
             'mi<mk<footnt-clo\n')
         else:
             self.__write_to_foot_obj.write(line)
+
     def __found_footnote(self, line):
         """ Found a footnote"""
         self.__found_a_footnote = 1
@@ -81,6 +86,7 @@ class Footnote:
         'mi<mk<footnt-ind<%04d\n' % self.__footnote_count)
         self.__write_to_foot_obj.write(
         'mi<mk<footnt-ope<%04d\n' % self.__footnote_count)
+
     def __default_sep(self, line):
         """Handle all tokens that are not footnote tokens"""
         if self.__token_info == 'cw<nt<footnote__':
@@ -91,6 +97,7 @@ class Footnote:
             self.__write_obj.write(
                 'tx<nu<__________<%s\n' % num
             )
+
     def __initiate_sep_values(self):
         """
         initiate counters for separate_footnotes method.
@@ -102,6 +109,7 @@ class Footnote:
         self.__in_footnote = 0
         self.__first_line = 0 #have not processed the first line of footnote
         self.__footnote_count = 0
+
     def separate_footnotes(self):
         """
         Separate all the footnotes in an RTF file and put them at the bottom,
@@ -111,58 +119,50 @@ class Footnote:
         bottom of the main file.
         """
         self.__initiate_sep_values()
-        read_obj = open(self.__file)
-        self.__write_obj = open(self.__write_to, 'w')
         self.__footnote_holder = tempfile.mktemp()
-        self.__write_to_foot_obj = open(self.__footnote_holder, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            # keep track of opening and closing brackets
-            if self.__token_info == 'ob<nu<open-brack':
-                self.__ob_count = line[-5:-1]
-            if self.__token_info == 'cb<nu<clos-brack':
-                self.__cb_count = line[-5:-1]
-            # In the middle of footnote text
-            if self.__in_footnote:
-                self.__in_footnote_func(line)
-            # not in the middle of footnote text
-            else:
-                self.__default_sep(line)
-        self.__write_obj.close()
-        read_obj.close()
-        self.__write_to_foot_obj.close()
-        read_obj = open(self.__footnote_holder, 'r')
-        write_obj = open(self.__write_to, 'a')
-        write_obj.write(
-        'mi<mk<sect-close\n'
-        'mi<mk<body-close\n'
-        'mi<tg<close_____<section\n'
-        'mi<tg<close_____<body\n'
-        'mi<tg<close_____<doc\n'
-        'mi<mk<footnt-beg\n')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            write_obj.write(line)
-        write_obj.write(
-        'mi<mk<footnt-end\n')
-        read_obj.close()
-        write_obj.close()
+        with open(self.__file) as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                with open(self.__footnote_holder, 'w') as self.__write_to_foot_obj:
+                    for line in read_obj:
+                        self.__token_info = line[:16]
+                        # keep track of opening and closing brackets
+                        if self.__token_info == 'ob<nu<open-brack':
+                            self.__ob_count = line[-5:-1]
+                        if self.__token_info == 'cb<nu<clos-brack':
+                            self.__cb_count = line[-5:-1]
+                        # In the middle of footnote text
+                        if self.__in_footnote:
+                            self.__in_footnote_func(line)
+                        # not in the middle of footnote text
+                        else:
+                            self.__default_sep(line)
+        with open(self.__footnote_holder, 'r') as read_obj:
+            with open(self.__write_to, 'a') as write_obj:
+                write_obj.write(
+                    'mi<mk<sect-close\n'
+                    'mi<mk<body-close\n'
+                    'mi<tg<close_____<section\n'
+                    'mi<tg<close_____<body\n'
+                    'mi<tg<close_____<doc\n'
+                    'mi<mk<footnt-beg\n')
+                for line in read_obj:
+                    write_obj.write(line)
+                write_obj.write(
+                'mi<mk<footnt-end\n')
         os.remove(self.__footnote_holder)
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "footnote_separate.data")
         copy_obj.rename(self.__write_to, self.__file)
         os.remove(self.__write_to)
+
     def update_info(self, file, copy):
         """
         Unused method
         """
         self.__file = file
         self.__copy = copy
+
     def __get_foot_body_func(self, line):
         """
         Process lines in main body and look for beginning of footnotes.
@@ -172,6 +172,7 @@ class Footnote:
             self.__state = 'foot'
         else:
             self.__write_obj.write(line)
+
     def __get_foot_foot_func(self, line):
         """
         Copy footnotes from bottom of file to a separate, temporary file.
@@ -180,6 +181,7 @@ class Footnote:
             self.__state = 'body'
         else:
             self.__write_to_foot_obj.write(line)
+
     def __get_footnotes(self):
         """
         Private method to remove footnotes from main file.  Read one line from
@@ -188,21 +190,16 @@ class Footnote:
         These two functions do the work of separating the footnotes form the
         body.
         """
-        read_obj = open(self.__file)
-        self.__write_obj = open(self.__write_to, 'w')
-            # self.__write_to = "footnote_info.data"
-        self.__write_to_foot_obj = open(self.__footnote_holder, 'w')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            self.__token_info = line[:16]
-            if self.__state == 'body':
-                self.__get_foot_body_func(line)
-            elif self.__state == 'foot':
-                self.__get_foot_foot_func(line)
-        read_obj.close()
-        self.__write_obj.close()
-        self.__write_to_foot_obj.close()
+        with open(self.__file) as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                with open(self.__footnote_holder, 'w') as self.__write_to_foot_obj:
+                    for line in read_obj:
+                        self.__token_info = line[:16]
+                        if self.__state == 'body':
+                            self.__get_foot_body_func(line)
+                        elif self.__state == 'foot':
+                            self.__get_foot_foot_func(line)
+
     def __get_foot_from_temp(self, num):
         """
         Private method for joining footnotes to body. This method reads from
@@ -213,9 +210,7 @@ class Footnote:
         look_for = 'mi<mk<footnt-ope<' + num + '\n'
         found_foot = 0
         string_to_return = ''
-        line = 1
-        while line:
-            line = self.__read_from_foot_obj.readline()
+        for line in self.__read_from_foot_obj:
             if found_foot:
                 if line == 'mi<mk<footnt-clo\n':
                     return string_to_return
@@ -223,6 +218,7 @@ class Footnote:
             else:
                 if line == look_for:
                     found_foot = 1
+
     def __join_from_temp(self):
         """
         Private method for rejoining footnotes to body.  Read from the
@@ -232,16 +228,14 @@ class Footnote:
         print out to the third file.
         If no footnote marker is found, simply print out the token (line).
         """
-        self.__read_from_foot_obj = open(self.__footnote_holder, 'r')
-        read_obj = open(self.__write_to, 'r')
-        self.__write_obj = open(self.__write_to2, 'w')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            if line[:16] == 'mi<mk<footnt-ind':
-                line = self.__get_foot_from_temp(line[17:-1])
-            self.__write_obj.write(line)
-        read_obj.close()
+        with open(self.__footnote_holder, 'r') as self.__read_from_foot_obj:
+            with open(self.__write_to, 'r') as read_obj:
+                with open(self.__write_to2, 'w') as self.__write_obj:
+                    for line in read_obj:
+                        if line[:16] == 'mi<mk<footnt-ind':
+                            line = self.__get_foot_from_temp(line[17:-1])
+                        self.__write_obj.write(line)
+
     def join_footnotes(self):
         """
         Join the footnotes from the bottom of the file and put them in their
@@ -258,8 +252,8 @@ class Footnote:
         self.__state = 'body'
         self.__get_footnotes()
         self.__join_from_temp()
-        self.__write_obj.close()
-        self.__read_from_foot_obj.close()
+        # self.__write_obj.close()
+        # self.__read_from_foot_obj.close()
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to2, "footnote_joined.data")
diff --git a/src/calibre/ebooks/rtf2xml/get_char_map.py b/src/calibre/ebooks/rtf2xml/get_char_map.py
index db307b19d6..fb3ef28b4f 100755
--- a/src/calibre/ebooks/rtf2xml/get_char_map.py
+++ b/src/calibre/ebooks/rtf2xml/get_char_map.py
@@ -43,27 +43,28 @@ class GetCharMap:
     def get_char_map(self, map):
         if map == 'ansicpg0':
             map = 'ansicpg1250'
-        found_map = 0
+        if map in ('ansicpg10000', '10000'):
+            map = 'mac_roman'
+        found_map = False
         map_dict = {}
         self.__char_file.seek(0)
-        for line in self.__char_file.readlines():
+        for line in self.__char_file:
             if not line.strip(): continue
             begin_element = '<%s>' % map;
             end_element = '</%s>' % map
             if not found_map:
                 if begin_element in line:
-                    found_map = 1
+                    found_map = True
             else:
                 if end_element in line:
                     break
                 fields = line.split(':')
                 fields[1].replace('\\colon', ':')
                 map_dict[fields[1]] = fields[3]
-            
-        
+
+
         if not found_map:
-            msg = 'no map found\n'
-            msg += 'map is "%s"\n'%(map,)
+            msg = 'no map found\nmap is "%s"\n'%(map,)
             raise self.__bug_handler, msg
         return map_dict
 
diff --git a/src/calibre/ebooks/rtf2xml/hex_2_utf8.py b/src/calibre/ebooks/rtf2xml/hex_2_utf8.py
index d67dce30d2..ba85174845 100755
--- a/src/calibre/ebooks/rtf2xml/hex_2_utf8.py
+++ b/src/calibre/ebooks/rtf2xml/hex_2_utf8.py
@@ -54,10 +54,10 @@ class Hex2Utf8:
             'convert_to_caps'--wether to convert caps to utf-8
         Returns:
             nothing
-            """
+        """
         self.__file = in_file
         self.__copy = copy
-        if area_to_convert != 'preamble' and area_to_convert != 'body':
+        if area_to_convert not in ('preamble', 'body'):
             msg = (
             'Developer error! Wrong flag.\n'
             'in module "hex_2_utf8.py\n'
@@ -79,7 +79,8 @@ class Hex2Utf8:
         self.__write_to = tempfile.mktemp()
         self.__bug_handler = bug_handler
         self.__invalid_rtf_handler = invalid_rtf_handler
-    def update_values(  self,
+
+    def update_values(self,
                         file,
                         area_to_convert,
                         char_file,
@@ -132,6 +133,7 @@ class Hex2Utf8:
         # self.__convert_symbol = 0
         # self.__convert_wingdings = 0
         # self.__convert_zapf = 0
+
     def __initiate_values(self):
         """
         Required:
@@ -191,6 +193,7 @@ class Hex2Utf8:
             'body'          :       self.__body_func,
             'mi<mk<body-open_'  :   self.__found_body_func,
             'tx<hx<__________'  :   self.__hex_text_func,
+            # 'tx<nu<__________'  :   self.__text_func,
             }
         self.__body_state_dict = {
             'preamble'      :       self.__preamble_for_body_func,
@@ -209,6 +212,7 @@ class Hex2Utf8:
         }
         self.__caps_list = ['false']
         self.__font_list = ['not-defined']
+
     def __hex_text_func(self, line):
         """
         Required:
@@ -218,12 +222,12 @@ class Hex2Utf8:
             token is in the dictionary, then check if the value starts with a
             "&". If it does, then tag the result as utf text. Otherwise, tag it
             as normal text.
-            If the nex_num is not in the dictionary, then a mistake has been
+            If the hex_num is not in the dictionary, then a mistake has been
             made.
             """
         hex_num = line[17:-1]
         converted = self.__current_dict.get(hex_num)
-        if converted != None:
+        if converted is not None:
             # tag as utf-8
             if converted[0:1] == "&":
                 font = self.__current_dict_name
@@ -263,42 +267,43 @@ class Hex2Utf8:
                     # msg += 'dictionary is %s\n' % self.__current_dict_name
                     msg = 'Character "&#x%s;" does not appear to be valid (or is a control character)\n' % token
                     raise self.__bug_handler, msg
+
     def __found_body_func(self, line):
         self.__state = 'body'
         self.__write_obj.write(line)
+
     def __body_func(self, line):
         """
         When parsing preamble
         """
         self.__write_obj.write(line)
+
     def __preamble_func(self, line):
         action = self.__preamble_state_dict.get(self.__token_info)
-        if action != None:
+        if action is not None:
             action(line)
         else:
             self.__write_obj.write(line)
+
     def __convert_preamble(self):
         self.__state = 'preamble'
-        read_obj = open(self.__file, 'r')
         self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            action = self.__preamble_state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('error no state found in hex_2_utf8',
-                self.__state
-                )
-            action(line)
-        read_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            for line in read_obj:
+                self.__token_info = line[:16]
+                action = self.__preamble_state_dict.get(self.__state)
+                if action is None:
+                    # file.write() takes exactly one argument
+                    sys.stderr.write('error no state found in hex_2_utf8: %s\n'
+                        % self.__state)
+                action(line)
         self.__write_obj.close()
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "preamble_utf_convert.data")
         copy_obj.rename(self.__write_to, self.__file)
         os.remove(self.__write_to)
+
     def __preamble_for_body_func(self, line):
         """
         Required:
@@ -311,6 +316,7 @@ class Hex2Utf8:
         if self.__token_info == 'mi<mk<body-open_':
             self.__found_body_func(line)
         self.__write_obj.write(line)
+
     def __body_for_body_func(self, line):
         """
         Required:
@@ -321,10 +327,11 @@ class Hex2Utf8:
             Used when parsing the body.
         """
         action = self.__in_body_dict.get(self.__token_info)
-        if action != None:
+        if action is not None:
             action(line)
         else:
             self.__write_obj.write(line)
+
     def __start_font_func(self, line):
         """
         Required:
@@ -348,6 +355,7 @@ class Hex2Utf8:
         else:
             self.__current_dict_name = 'default'
             self.__current_dict = self.__def_dict
+
     def __end_font_func(self, line):
         """
         Required:
@@ -376,6 +384,7 @@ class Hex2Utf8:
         else:
             self.__current_dict_name = 'default'
             self.__current_dict = self.__def_dict
+
     def __start_special_font_func_old(self, line):
         """
         Required:
@@ -398,6 +407,7 @@ class Hex2Utf8:
             self.__current_dict.append(self.__dingbats_dict)
             self.__special_fonts_found += 1
             self.__current_dict_name = 'Zapf Dingbats'
+
     def __end_special_font_func(self, line):
         """
         Required:
@@ -416,6 +426,7 @@ class Hex2Utf8:
             self.__current_dict.pop()
             self.__special_fonts_found -= 1
             self.__dict_name = 'default'
+
     def __start_caps_func_old(self, line):
         """
         Required:
@@ -427,6 +438,7 @@ class Hex2Utf8:
             self.__in_caps to 1
         """
         self.__in_caps = 1
+
     def __start_caps_func(self, line):
         """
         Required:
@@ -440,6 +452,7 @@ class Hex2Utf8:
         self.__in_caps = 1
         value = line[17:-1]
         self.__caps_list.append(value)
+
     def __end_caps_func(self, line):
         """
         Required:
@@ -455,7 +468,8 @@ class Hex2Utf8:
         else:
             sys.stderr.write('Module is hex_2_utf8\n')
             sys.stderr.write('method is __end_caps_func\n')
-            sys.stderr.write('caps list should be more than one?\n')
+            sys.stderr.write('caps list should be more than one?\n') #self.__in_caps not set
+
     def __text_func(self, line):
         """
         Required:
@@ -466,9 +480,8 @@ class Hex2Utf8:
             if in caps, convert. Otherwise, print out.
         """
         text = line[17:-1]
-        if self.__current_dict_name == 'Symbol'\
-          or self.__current_dict_name == 'Wingdings'\
-          or self.__current_dict_name == 'Zapf Dingbats':
+        # print line
+        if self.__current_dict_name in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
             the_string = ''
             for letter in text:
                 hex_num = hex(ord(letter))
@@ -477,21 +490,21 @@ class Hex2Utf8:
                 hex_num = hex_num[2:]
                 hex_num = '\'%s' % hex_num
                 converted = self.__current_dict.get(hex_num)
-                if converted == None:
+                if converted is None:
                     sys.stderr.write('module is hex_2_ut8\n')
                     sys.stderr.write('method is __text_func\n')
                     sys.stderr.write('no hex value for "%s"\n' % hex_num)
                 else:
                     the_string += converted
             self.__write_obj.write('tx<nu<__________<%s\n' % the_string)
+            # print the_string
         else:
             if self.__caps_list[-1] == 'true' \
                 and self.__convert_caps\
-                and self.__current_dict_name != 'Symbol'\
-                and self.__current_dict_name != 'Wingdings'\
-                and self.__current_dict_name != 'Zapf Dingbats':
+                and self.__current_dict_name not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
                 text = text.upper()
             self.__write_obj.write('tx<nu<__________<%s\n' % text)
+
     def __utf_to_caps_func(self, line):
         """
         Required:
@@ -506,6 +519,7 @@ class Hex2Utf8:
             # utf_text = utf_text.upper()
             utf_text = self.__utf_token_to_caps_func(utf_text)
         self.__write_obj.write('tx<ut<__________<%s\n' % utf_text)
+
     def __utf_token_to_caps_func(self, char_entity):
         """
         Required:
@@ -530,28 +544,26 @@ class Hex2Utf8:
             return char_entity
         else:
             return converted
+
     def __convert_body(self):
         self.__state = 'body'
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            action = self.__body_state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('error no state found in hex_2_utf8',
-                self.__state
-                )
-            action(line)
-        read_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            self.__write_obj = open(self.__write_to, 'w')
+            for line in read_obj:
+                self.__token_info = line[:16]
+                action = self.__body_state_dict.get(self.__state)
+                if action is None:
+                    # file.write() takes exactly one argument
+                    sys.stderr.write('error no state found in hex_2_utf8: %s\n'
+                        % self.__state)
+                action(line)
         self.__write_obj.close()
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "body_utf_convert.data")
         copy_obj.rename(self.__write_to, self.__file)
         os.remove(self.__write_to)
+
     def convert_hex_2_utf8(self):
         self.__initiate_values()
         if self.__area_to_convert == 'preamble':
diff --git a/src/calibre/ebooks/rtf2xml/inline.py b/src/calibre/ebooks/rtf2xml/inline.py
index 5ca1cd0783..7eda0ce429 100755
--- a/src/calibre/ebooks/rtf2xml/inline.py
+++ b/src/calibre/ebooks/rtf2xml/inline.py
@@ -1,5 +1,7 @@
 import sys, os, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+
 """
 States.
 1. default
@@ -36,6 +38,7 @@ class Inline:
         self.__copy = copy
         self.__run_level = run_level
         self.__write_to = tempfile.mktemp()
+
     def __initiate_values(self):
         """
         Initiate all values.
@@ -51,7 +54,6 @@ class Inline:
             'tx<ut<__________'  :       self.__found_text_func,
             'mi<mk<inline-fld'  :       self.__found_text_func,
             'text'              :       self.__found_text_func,
-            'cw<nu<hard-lineb'  :       self.__found_text_func, #calibre
             'cb<nu<clos-brack'  :       self.__close_bracket_func,
             'mi<mk<par-end___'  :       self.__end_para_func,
             'mi<mk<footnt-ope'  :       self.__end_para_func,
@@ -63,7 +65,6 @@ class Inline:
             'tx<hx<__________'  :       self.__found_text_func,
             'tx<ut<__________'  :       self.__found_text_func,
             'text'              :       self.__found_text_func,
-            'cw<nu<hard-lineb'  :       self.__found_text_func, #calibre
             'mi<mk<inline-fld'  :       self.__found_text_func,
             'ob<nu<open-brack':         self.__found_open_bracket_func,
             'mi<mk<par-end___'  :       self.__end_para_func,
@@ -83,12 +84,12 @@ class Inline:
         self.__in_para = 0 #  not in paragraph
         self.__char_dict = {
             # character info => ci
-            'annotation'    :       'annotation',
+            'annotation'    :   'annotation',
             'blue______'    :   'blue',
             'bold______'    :   'bold',
-            'caps______'    :       'caps',
-            'char-style'    :       'character-style',
-            'dbl-strike'    :    'double-strike-through',
+            'caps______'    :   'caps',
+            'char-style'    :   'character-style',
+            'dbl-strike'    :   'double-strike-through',
             'emboss____'    :   'emboss',
             'engrave___'    :   'engrave',
             'font-color'    :   'font-color',
@@ -96,7 +97,7 @@ class Inline:
             'font-size_'    :   'font-size',
             'font-style'    :   'font-style',
             'font-up___'    :   'superscript',
-            'footnot-mk'    :       'footnote-marker',
+            'footnot-mk'    :   'footnote-marker',
             'green_____'    :   'green',
             'hidden____'    :   'hidden',
             'italics___'    :   'italics',
@@ -107,9 +108,10 @@ class Inline:
             'strike-thr'    :   'strike-through',
             'subscript_'    :   'subscript',
             'superscrip'    :   'superscript',
-            'underlined'    :       'underlined',
+            'underlined'    :   'underlined',
         }
         self.__caps_list = ['false']
+
     def __set_list_func(self, line):
         """
         Requires:
@@ -128,6 +130,7 @@ class Inline:
                 self.__place = 'in_list'
                 self.__inline_list = self.__list_inline_list
                 self.__groups_in_waiting = self.__groups_in_waiting_list
+
     def __default_func(self, line):
         """
         Requires:
@@ -140,8 +143,8 @@ class Inline:
         action = self.__default_dict.get(self.__token_info)
         if action:
             action(line)
-        if self.__token_info != 'cw<nu<hard-lineb': #calibre
-            self.__write_obj.write(line)
+        self.__write_obj.write(line)
+
     def __found_open_bracket_func(self, line):
         """
         Requires:
@@ -156,6 +159,7 @@ class Inline:
         self.__groups_in_waiting[0] += 1
         self.__inline_list.append({})
         self.__inline_list[-1]['contains_inline'] = 0
+
     def __after_open_bracket_func(self, line):
         """
         Requires:
@@ -176,6 +180,7 @@ class Inline:
                 self.__state = 'default' #  a non control word?
                 action(line)
         self.__write_obj.write(line)
+
     def __handle_control_word(self, line):
         """
         Required:
@@ -206,6 +211,7 @@ class Inline:
                 elif char_value == 'Zapf Dingbats':
                     self.__write_obj.write('mi<mk<font-dingb\n')
             """
+
     def __close_bracket_func(self, line):
         """
         Requires:
@@ -244,6 +250,7 @@ class Inline:
         self.__inline_list.pop()
         if self.__groups_in_waiting[0] != 0:
             self.__groups_in_waiting[0] -= 1
+
     def __found_text_func(self, line):
         """
         Required:
@@ -257,7 +264,6 @@ class Inline:
                 Text can mark the start of a paragraph.
                 If already in a paragraph, check to see if any groups are waiting
                 to be added. If so, use another method to write these groups.
-            3. If not check if hardline break, then write
         """
         if self.__place == 'in_list':
             self.__write_inline()
@@ -265,12 +271,9 @@ class Inline:
             if not self.__in_para:
                 self.__in_para = 1
                 self.__start_para_func(line)
-            else:
-                if self.__token_info == 'cw<nu<hard-lineb': #calibre
-                    self.__write_obj.write('mi<tg<empty_____<hardline-break\n')
-                if self.__groups_in_waiting[0] != 0:
+            elif self.__groups_in_waiting[0] != 0:
                     self.__write_inline()
-                
+
     def __write_inline(self):
         """
         Required:
@@ -314,6 +317,7 @@ class Inline:
                             self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
                     self.__write_obj.write('\n')
         self.__groups_in_waiting[0] = 0
+
     def __end_para_func(self, line):
         """
         Requires:
@@ -342,6 +346,7 @@ class Inline:
                     self.__write_obj.write('mi<mk<caps-end__\n')
                 self.__write_obj.write('mi<tg<close_____<inline\n')
         self.__in_para = 0
+
     def __start_para_func(self, line):
         """
         Requires:
@@ -369,12 +374,14 @@ class Inline:
                         self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
                 self.__write_obj.write('\n')
         self.__groups_in_waiting[0] = 0
+
     def __found_field_func(self, line):
         """
         Just a default function to make sure I don't prematurely exit
         default state
         """
         pass
+
     def form_tags(self):
         """
         Requires:
@@ -386,32 +393,27 @@ class Inline:
             the state.
         """
         self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            token = line[0:-1]
-            self.__token_info = ''
-            if token == 'tx<mc<__________<rdblquote'\
-                or token == 'tx<mc<__________<ldblquote'\
-                or token == 'tx<mc<__________<lquote'\
-                or token == 'tx<mc<__________<rquote'\
-                or token == 'tx<mc<__________<emdash'\
-                or token == 'tx<mc<__________<endash'\
-                or token == 'tx<mc<__________<bullet':
-                self.__token_info = 'text'
-            else:
-                self.__token_info = line[:16]
-            self.__set_list_func(line)
-            action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('No matching state in module inline_for_lists.py\n')
-                sys.stderr.write(self.__state + '\n')
-            action(line)
-        read_obj.close()
-        self.__write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
+                    token = line[0:-1]
+                    self.__token_info = ''
+                    if token == 'tx<mc<__________<rdblquote'\
+                        or token == 'tx<mc<__________<ldblquote'\
+                        or token == 'tx<mc<__________<lquote'\
+                        or token == 'tx<mc<__________<rquote'\
+                        or token == 'tx<mc<__________<emdash'\
+                        or token == 'tx<mc<__________<endash'\
+                        or token == 'tx<mc<__________<bullet':
+                        self.__token_info = 'text'
+                    else:
+                        self.__token_info = line[:16]
+                    self.__set_list_func(line)
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write('No matching state in module inline_for_lists.py\n')
+                        sys.stderr.write(self.__state + '\n')
+                    action(line)
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "inline.data")
diff --git a/src/calibre/ebooks/rtf2xml/line_endings.py b/src/calibre/ebooks/rtf2xml/line_endings.py
index 543ae5dd83..dfc482d981 100755
--- a/src/calibre/ebooks/rtf2xml/line_endings.py
+++ b/src/calibre/ebooks/rtf2xml/line_endings.py
@@ -15,8 +15,11 @@
 #                                                                       #
 #                                                                       #
 #########################################################################
-import os, tempfile, re
+import os, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+from calibre.utils.cleantext import clean_ascii_chars
+
 class FixLineEndings:
     """Fix line endings"""
     def __init__(self,
@@ -32,36 +35,23 @@ class FixLineEndings:
         self.__run_level = run_level
         self.__write_to = tempfile.mktemp()
         self.__replace_illegals = replace_illegals
+
     def fix_endings(self):
-        ##tempFileName = tempfile.mktemp()
-        illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
-        #nums = [0, 1, 2, 3, 4, 5, 6, 7, 8,  11,  14, 15, 16, 17, 18, 19]
-        """
-read_obj = open(self.__file, 'r')
-line = read_obj.read(1000)
-regexp = re.compile(r"\r")
-macintosh = regexp.search(line)
-read_obj.close()
-        """
-        # always check since I have to get rid of illegal characters
-        macintosh = 1
-        if macintosh:
-            line = 1
-            read_obj = open(self.__file, 'r')
-            write_obj = open(self.__write_to, 'w')
-            while line:
-                line = read_obj.read(1000)
-                # line = re.sub(regexp,"\n",line)
-                line = line.replace ('\r', '\n')
-                if self.__replace_illegals:
-                    line = re.sub(illegal_regx, '', line)
-                    # for num in nums:
-                        # line = line.replace(chr(num), '')
-                write_obj.write(line )
-            read_obj.close()
-            write_obj.close()
-            copy_obj = copy.Copy(bug_handler = self.__bug_handler)
-            if self.__copy:
-                copy_obj.copy_file(self.__write_to, "line_endings.data")
-            copy_obj.rename(self.__write_to, self.__file)
-            os.remove(self.__write_to)
+        #read
+        with open(self.__file, 'r') as read_obj:
+            input_file = read_obj.read()
+        #calibre go from win and mac to unix
+        input_file = input_file.replace ('\r\n', '\n')
+        input_file = input_file.replace ('\r', '\n')
+        #remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
+        if self.__replace_illegals:
+            input_file = clean_ascii_chars(input_file)
+        #write
+        with open(self.__write_to, 'wb') as write_obj:
+            write_obj.write(input_file)
+        #copy
+        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "line_endings.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
diff --git a/src/calibre/ebooks/rtf2xml/pict.py b/src/calibre/ebooks/rtf2xml/pict.py
index 3a1044520e..c8a2e7e84a 100755
--- a/src/calibre/ebooks/rtf2xml/pict.py
+++ b/src/calibre/ebooks/rtf2xml/pict.py
@@ -16,7 +16,9 @@
 #                                                                       #
 #########################################################################
 import sys, os, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+
 class Pict:
     """Process graphic information"""
     def __init__(self,
@@ -36,13 +38,11 @@ class Pict:
         self.__ob_count = 0
         self.__cb_count = 0
         self.__pict_count = 0
-        self.__in_pict = 0
-        self.__already_found_pict = 0
+        self.__in_pict = False
+        self.__already_found_pict = False
         self.__orig_file = orig_file
         self.__initiate_pict_dict()
         self.__out_file = out_file
-        # this is left over
-        self.__no_ask = 1
 
     def __initiate_pict_dict(self):
         self.__pict_dict = {
@@ -71,57 +71,43 @@ class Pict:
                 self.__out_file))
         else:
             dir_name = os.path.dirname(self.__orig_file)
-        # self.__output_to_file_func()
         self.__dir_name = base_name + "_rtf_pict_dir/"
         self.__dir_name = os.path.join(dir_name, self.__dir_name)
         if not os.path.isdir(self.__dir_name):
             try:
                 os.mkdir(self.__dir_name)
             except OSError, msg:
-                msg = str(msg)
-                msg += "Couldn't make directory '%s':\n" % (self.__dir_name)
+                msg = "Couldn't make directory '%s':\n%s\n" % (self.__dir_name, msg)
-                raise self.__bug_handler
+                raise self.__bug_handler, msg
         else:
-            if self.__no_ask:
-                user_response = 'r'
-            else:
-                msg = 'Do you want to remove all files in %s?\n' % self.__dir_name
-                msg += 'Type "r" to remove.\n'
-                msg +=  'Type any other key to keep files in place.\n'
-                sys.stderr.write(msg)
-                user_response = raw_input()
-            if user_response == 'r':
-                if self.__run_level > 1:
-                    sys.stderr.write('Removing files from old pict directory...\n')
-                all_files = os.listdir(self.__dir_name)
-                for the_file in all_files:
-                    the_file = os.path.join(self.__dir_name, the_file)
-                    try:
-                        os.remove(the_file)
-                    except OSError:
-                        pass
-                if self.__run_level > 1:
-                    sys.stderr.write('Files removed.\n')
+            if self.__run_level > 1:
+                sys.stderr.write('Removing files from old pict directory...\n')
+            all_files = os.listdir(self.__dir_name)
+            for the_file in all_files:
+                the_file = os.path.join(self.__dir_name, the_file)
+                try:
+                    os.remove(the_file)
+                except OSError:
+                    pass
+            if self.__run_level > 1:
+                sys.stderr.write('Files removed.\n')
 
     def __create_pict_file(self):
         """Create a file for all the pict data to be written to.
         """
         self.__pict_file = os.path.join(self.__dir_name, 'picts.rtf')
-        write_pic_obj = open(self.__pict_file, 'w')
-        write_pic_obj.close()
         self.__write_pic_obj = open(self.__pict_file, 'a')
 
     def __in_pict_func(self, line):
         if self.__cb_count == self.__pict_br_count:
-            self.__in_pict = 0
+            self.__in_pict = False
             self.__write_pic_obj.write("}\n")
-            return 1
+            return True
         else:
             action = self.__pict_dict.get(self.__token_info)
             if action:
-                line = action(line)
-                self.__write_pic_obj.write(line)
-            return 0
+                self.__write_pic_obj.write(action(line))
+            return False
 
     def __default(self, line, write_obj):
         """Determine if each token marks the beginning of pict data.
@@ -142,53 +128,50 @@ class Pict:
             write_obj.write('mi<mk<pict-end__\n')
             if not self.__already_found_pict:
                 self.__create_pict_file()
-                self.__already_found_pict=1;
+                self.__already_found_pict=True;
                 self.__print_rtf_header()
             self.__in_pict = 1
             self.__pict_br_count = self.__ob_count
             self.__cb_count = 0
             self.__write_pic_obj.write("{\\pict\n")
-            return 0
-        return 1
+            return False
+        return True
 
     def __print_rtf_header(self):
         """Print to pict file the necessary RTF data for the file to be
         recognized as an RTF file.
         """
-        self.__write_pic_obj.write("{\\rtf1 \n")
-        self.__write_pic_obj.write("{\\fonttbl\\f0\\null;} \n")
-        self.__write_pic_obj.write("{\\colortbl\\red255\\green255\\blue255;} \n")
-        self.__write_pic_obj.write("\\pard \n")
+        self.__write_pic_obj.write("{\\rtf1 \n{\\fonttbl\\f0\\null;} \n")
+        self.__write_pic_obj.write("{\\colortbl\\red255\\green255\\blue255;} \n\\pard \n")
 
     def process_pict(self):
         self.__make_dir()
-        read_obj = open(self.__file)
-        write_obj = open(self.__write_to, 'w')
-        line_to_read = 'dummy'
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            if self.__token_info == 'ob<nu<open-brack':
-                self.__ob_count = line[-5:-1]
-            if self.__token_info == 'cb<nu<clos-brack':
-                self.__cb_count = line[-5:-1]
-            if not self.__in_pict:
-                to_print = self.__default(line, write_obj)
-                if to_print :
-                    write_obj.write(line)
-            else:
-                to_print = self.__in_pict_func(line)
-                if to_print :
-                    write_obj.write(line)
-        if self.__already_found_pict:
-            self.__write_pic_obj.write("}\n")
-            self.__write_pic_obj.close()
-        read_obj.close()
-        write_obj.close()
+        with open(self.__file) as read_obj:
+            with open(self.__write_to, 'w') as write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    if self.__token_info == 'ob<nu<open-brack':
+                        self.__ob_count = line[-5:-1]
+                    if self.__token_info == 'cb<nu<clos-brack':
+                        self.__cb_count = line[-5:-1]
+                    if not self.__in_pict:
+                        to_print = self.__default(line, write_obj)
+                        if to_print :
+                            write_obj.write(line)
+                    else:
+                        to_print = self.__in_pict_func(line)
+                        if to_print :
+                            write_obj.write(line)
+                if self.__already_found_pict:
+                    self.__write_pic_obj.write("}\n")
+                    self.__write_pic_obj.close()
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "pict.data")
+            try:
+                copy_obj.copy_file(self.__pict_file, "pict.rtf")
+            except:
+                pass
         copy_obj.rename(self.__write_to, self.__file)
         os.remove(self.__write_to)
         if self.__pict_count == 0:
diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py
index 19a7d38135..9460af07fc 100755
--- a/src/calibre/ebooks/rtf2xml/process_tokens.py
+++ b/src/calibre/ebooks/rtf2xml/process_tokens.py
@@ -15,8 +15,10 @@
 #                                                                       #
 #                                                                       #
 #########################################################################
-import os, re,  tempfile
+import os, re, tempfile
+
 from calibre.ebooks.rtf2xml import copy, check_brackets
+
 class ProcessTokens:
     """
     Process each token on a line and add information that will be useful for
@@ -41,14 +43,16 @@ class ProcessTokens:
         self.__bracket_count=0
         self.__exception_handler = exception_handler
         self.__bug_handler = bug_handler
+
     def compile_expressions(self):
         self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
         self.__utf_exp = re.compile(r'(&.*?;)')
+
     def initiate_token_dict(self):
         self.__return_code = 0
         self.dict_token={
         # unicode
-        'mshex'             :   ('nu', '__________', self.__ms_hex_func),
+        'mshex'              :  ('nu', '__________', self.__ms_hex_func),
         # brackets
         '{'                  :	('nu', '{', self.ob_func),
         '}'                  :	('nu', '}', self.cb_func),
@@ -66,6 +70,7 @@ class ProcessTokens:
         ';'                  :	('mc', ';', self.ms_sub_func),
         # this must be wrong
         '-'                  :	('mc', '-', self.ms_sub_func),
+        'line'               :  ('mi', 'hardline-break', self.hardline_func), #calibre
         # misc => ml
         '*'                  :	('ml', 'asterisk__', self.default_func),
         ':'                  :	('ml', 'colon_____', self.default_func),
@@ -73,7 +78,6 @@ class ProcessTokens:
         'backslash'          :	('nu', '\\', self.text_func),
         'ob'                 :	('nu', '{', self.text_func),
         'cb'                 :	('nu', '}', self.text_func),
-        'line'               :  ('nu', 'hard-lineb', self.default_func), #calibre
         #'line'               :  ('nu', ' ', self.text_func), calibre
         # paragraph formatting => pf
         'page'               :  ('pf', 'page-break', self.default_func),
@@ -159,15 +163,17 @@ class ProcessTokens:
         'rtf'                :	('ri', 'rtf_______', self.default_func),
         'deff'               :	('ri', 'deflt-font', self.default_func),
         'mac'                :	('ri', 'macintosh_', self.default_func),
+        'pc'                 :	('ri', 'pc________', self.default_func),
+        'pca'                :	('ri', 'pca_______', self.default_func),
         'ansi'               :	('ri', 'ansi______', self.default_func),
         'ansicpg'            :	('ri', 'ansi-codpg', self.default_func),
         # notes => nt
         'footnote'           :	('nt', 'footnote__', self.default_func),
         'ftnalt'             :	('nt', 'type______<endnote', self.two_part_func),
         # anchor => an
-        'tc'                :	('an', 'toc_______', self.default_func),
+        'tc'                 :	('an', 'toc_______', self.default_func),
         'bkmkstt'            :	('an', 'book-mk-st', self.default_func),
-        'bkmkstart'         :	('an', 'book-mk-st', self.default_func),
+        'bkmkstart'          :	('an', 'book-mk-st', self.default_func),
         'bkmkend'            :	('an', 'book-mk-en', self.default_func),
         'xe'                 :	('an', 'index-mark', self.default_func),
         'rxe'                :	('an', 'place_____', self.default_func),
@@ -347,7 +353,7 @@ class ProcessTokens:
             10:     'Kanji numbering without the digit character',
             11:     'Kanji numbering with the digit character',
             1246:   'phonetic Katakana characters in aiueo order',
-            1346:    'phonetic katakana characters in iroha order',
+            1346:   'phonetic katakana characters in iroha order',
             14:     'double byte character',
             15:     'single byte character',
             16:     'Kanji numbering 3',
@@ -392,7 +398,7 @@ class ProcessTokens:
             5121 	:  'Arabic Algeria',
             15361 	:  'Arabic Bahrain',
             3073 	:  'Arabic Egypt',
-            1 	        :   'Arabic General',
+            1 	    :   'Arabic General',
             2049 	:  'Arabic Iraq',
             11265 	:  'Arabic Jordan',
             13313 	:  'Arabic Kuwait',
@@ -417,7 +423,7 @@ class ProcessTokens:
             1059 	:  'Byelorussian',
             1027 	:  'Catalan',
             2052 	:  'Chinese China',
-            4 	        :  'Chinese General',
+            4 	    :  'Chinese General',
             3076 	:  'Chinese Hong Kong',
             4100 	:  'Chinese Singapore',
             1028 	:  'Chinese Taiwan',
@@ -431,7 +437,7 @@ class ProcessTokens:
             2057 	:  'English British',
             4105 	:  'English Canada',
             9225 	:  'English Caribbean',
-            9 	        :  'English General',
+            9 	    :  'English General',
             6153 	:  'English Ireland',
             8201 	:  'English Jamaica',
             5129 	:  'English New Zealand',
@@ -595,30 +601,37 @@ class ProcessTokens:
         num = num[1:] # chop off leading 0, which I added
         num = num.upper() # the mappings store hex in caps
         return 'tx<hx<__________<\'%s\n' % num # add an ' for the mappings
+
     def ms_sub_func(self, pre, token, num):
         return 'tx<mc<__________<%s\n' % token
+
+    def hardline_func(self, pre, token, num):
+        return 'mi<tg<empty_____<%s\n' % token  # emits an empty-tag line; pre and num are unused
+
     def default_func(self, pre, token, num):
-        if num == None:
+        if num is None:
             num = 'true'
         return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
+
     def __list_type_func(self, pre, token, num):
         type = 'arabic'
-        if num == None:
+        if num is None:
             type = 'Arabic'
         else:
             try:
                 num = int(num)
             except ValueError:
                 if self.__run_level > 3:
-                    msg = 'number "%s" cannot be converted to integer\n' % num
+                    msg = 'Number "%s" cannot be converted to integer\n' % num
                     raise self.__bug_handler, msg
             type = self.__number_type_dict.get(num)
-            if type == None:
+            if type is None:
                 if self.__run_level > 3:
-                    msg = 'No type for "%s" in self.__number_type_dict\n'
-                    raise self.__bug_handler
+                    msg = 'No type for "%s" in self.__number_type_dict\n' % num  # name the unknown number
+                    raise self.__bug_handler, msg  # pass the message, as the ValueError branch does
                 type = 'Arabic'
         return 'cw<%s<%s<nu<%s\n' % (pre, token, type)
+
     def __language_func(self, pre, token, num):
         lang_name = self.__language_dict.get(int(re.search('[0-9]+', num).group()))
         if not lang_name:
@@ -627,31 +640,36 @@ class ProcessTokens:
                 msg = 'No entry for number "%s"' % num
                 raise self.__bug_handler, msg
         return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name)
+
     def two_part_func(self, pre, token, num):
         list = token.split("<")
         token = list[0]
         num = list[1]
         return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
         ##return 'cw<nu<nu<nu<%s>num<%s\n' % (token, num)
+
     def divide_by_2(self, pre, token, num):
         num = self.divide_num(num, 2)
         return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
         ##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
+
     def divide_by_20(self, pre, token, num):
         num = self.divide_num(num, 20)
         return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
         ##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
+
     def text_func(self, pre, token, num=None):
         return 'tx<nu<__________<%s\n' % token
+
     def ob_func(self, pre, token, num=None):
         self.__bracket_count += 1
-        ##return 'ob<%04d\n' % self.__bracket_count
         return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
+
     def cb_func(self, pre, token, num=None):
-        ##line = 'cb<%04d\n' % self.__bracket_count
         line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
         self.__bracket_count -= 1
         return line
+
     def color_func(self, pre, token, num):
         third_field = 'nu'
         if num[-1] == ';':
@@ -662,6 +680,7 @@ class ProcessTokens:
             num = "0" + num
         return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, num)
         ##return 'cw<cl<%s<nu<nu<%s>%s<%s\n' % (third_field, token, num, token)
+
     def bool_st_func(self, pre, token, num):
         if num is None or num == '' or num == '1':
             return 'cw<%s<%s<nu<true\n' % (pre, token)
@@ -670,24 +689,23 @@ class ProcessTokens:
             return 'cw<%s<%s<nu<false\n' % (pre, token)
                 ##return 'cw<nu<nu<nu<%s>false<%s\n' % (token, token)
         else:
-            msg = 'boolean should have some value module process tokens\n'
-            msg += 'token is ' + token + "\n"
-            msg += "'" + num + "'" + "\n"
+            msg = "boolean should have some value module process tokens\ntoken is %s\n'%s'\n" % (token, num)
             raise self.__bug_handler, msg
+
     def __no_sup_sub_func(self, pre, token, num):
         the_string = 'cw<ci<subscript_<nu<false\n'
         the_string += 'cw<ci<superscrip<nu<false\n'
         return the_string
+
     def divide_num(self, numerator, denominator):
         try:
-            numerator = float(re.search('[0-9.]+', numerator).group())            
+            #calibre why ignore negative number? Wrong in case of \fi
+            numerator = float(re.search('[0-9.\-]+', numerator).group())
         except TypeError, msg:
             if self.__run_level > 3:
-                msg = 'no number to process?\n'
-                msg += 'this indicates that the token '
-                msg += ' \(\\li\) should have a number and does not\n'
-                msg += 'numerator is "%s"\n' % numerator
-                msg += 'denominator is "%s"\n' % denominator
+                msg = ('No number to process?\nthis indicates that the token \(\\li\) \
+                should have a number and does not\nnumerator is \
+                "%s"\ndenominator is "%s"\n') % (numerator, denominator)
                 raise self.__bug_handler, msg
             if 5 > self.__return_code:
                 self.__return_code = 5
@@ -698,9 +716,10 @@ class ProcessTokens:
         if string_num[-2:] == ".0":
             string_num = string_num[:-2]
         return string_num
+
     def split_let_num(self, token):
         match_obj = re.search(self.__num_exp,token)
-        if match_obj != None:
+        if match_obj is not None:
             first = match_obj.group(1)
             second = match_obj.group(2)
             if not second:
@@ -714,6 +733,7 @@ class ProcessTokens:
                 raise self.__bug_handler
             return token, 0
         return first, second
+
     def convert_to_hex(self,number):
         """Convert a string to uppercase hexidecimal"""
         num = int(number)
@@ -722,6 +742,7 @@ class ProcessTokens:
             return hex_num
         except:
             raise self.__bug_handler
+
     def process_cw(self, token):
         """Change the value of the control word by determining what dictionary
         it belongs to"""
@@ -737,89 +758,62 @@ class ProcessTokens:
         pre, token, action = self.dict_token.get(token, (None, None, None))
         if action:
             return action(pre, token, num)
-    # unused function
-    def initiate_token_actions(self):
-        self.action_for_token={
-        '{'     :   self.ob_func,
-        '}'     :   self.cb_func,
-        '\\'    :   self.process_cw,
-        }
-    # unused function
-    def evaluate_token(self,token):
-        """Evaluate tokens. Return a value if the token is not a
-        control word. Otherwise, pass token onto another method
-        for further evaluation."""
-        token, action = self.dict_token.get(token[0:1])
-        if action:
-            line = action(token)
-            return line
-        else :
-            return  'tx<nu<nu<nu<nu<%s\n' % token
+
     def __check_brackets(self, in_file):
         self.__check_brack_obj = check_brackets.CheckBrackets\
             (file = in_file)
         good_br =  self.__check_brack_obj.check_brackets()[0]
         if not good_br:
             return 1
+
     def process_tokens(self):
         """Main method for handling other methods. """
-        first_token = 0
-        second_token = 0
-        read_obj = open(self.__file, 'r')
-        write_obj = open(self.__write_to, 'w')
-        line_to_read = "dummy"
         line_count = 0
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            token = line_to_read
-            token = token.replace("\n","")
-            if not token:
-                continue
-            line_count += 1
-            try:
-                token.decode('us-ascii')
-            except UnicodeError, msg:
-                msg = str(msg)
-                msg += 'Invalid RTF: File not ascii encoded.\n'
-                raise self.__exception_handler, msg
-            if not first_token:
-                if token != '\\{':
-                    msg = 'Invalid RTF: document doesn\'t start with {\n'
-                    raise self.__exception_handler, msg
-                first_token = 1
-            elif first_token and not second_token:
-                if token[0:4] != '\\rtf':
-                    msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
-                    raise self.__exception_handler, msg
-                second_token = 1
-            ##token = self.evaluate_token(token)
-            the_index = token.find('\\ ')
-            if token != None and  the_index > -1:
-                msg ='Invalid RTF: token "\\ " not valid. \n'
-                raise self.__exception_handler, msg
-            elif token[0:1] == "\\":
-                line = self.process_cw(token)
-                if line != None:
-                    write_obj.write(line)
-            else:
-                fields = re.split(self.__utf_exp, token)
-                for field in fields:
-                    if not field:
-                        continue
-                    if field[0:1] == '&':
-                        write_obj.write('tx<ut<__________<%s\n' % field)
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'wb') as write_obj:
+                for line in read_obj:
+                    token = line.replace("\n","")
+                    line_count += 1
+                    if line_count == 1 and token != '\\{':
+                            msg = 'Invalid RTF: document doesn\'t start with {\n'
+                            raise self.__exception_handler, msg
+                    elif line_count == 2 and token[0:4] != '\\rtf':
+                            msg = 'Invalid RTF: document doesn\'t start with \\rtf \n'
+                            raise self.__exception_handler, msg
+
+                    the_index = token.find('\\ ')
+                    if token is not None and  the_index > -1:
+                        msg = 'Invalid RTF: token "\\ " not valid.\n'
+                        raise self.__exception_handler, msg
+                    elif token[:1] == "\\":
+                        try:
+                            token.decode('us-ascii')
+                        except UnicodeError, msg:
+                            msg = 'Invalid RTF: Tokens not ascii encoded.\n%s' % str(msg)
+                            raise self.__exception_handler, msg
+                        line = self.process_cw(token)
+                        if line is not None:
+                            write_obj.write(line)
                     else:
-                        write_obj.write('tx<nu<__________<%s\n' % field)
-        read_obj.close()
-        write_obj.close()
+                        fields = re.split(self.__utf_exp, token)
+                        for field in fields:
+                            if not field:
+                                continue
+                            if field[0:1] == '&':
+                                write_obj.write('tx<ut<__________<%s\n' % field)
+                            else:
+                                write_obj.write('tx<nu<__________<%s\n' % field)
+
         if not line_count:
-            msg ='Invalid RTF: file appears to be empty. \n'
+            msg = 'Invalid RTF: file appears to be empty.\n'
             raise self.__exception_handler, msg
+
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "processed_tokens.data")
         copy_obj.rename(self.__write_to, self.__file)
         os.remove(self.__write_to)
+
         bad_brackets = self.__check_brackets(self.__file)
         if bad_brackets:
             msg = 'Invalid RTF: document does not have matching brackets.\n'
diff --git a/src/calibre/ebooks/rtf2xml/replace_illegals.py b/src/calibre/ebooks/rtf2xml/replace_illegals.py
index 901cdd289d..4b477087d4 100755
--- a/src/calibre/ebooks/rtf2xml/replace_illegals.py
+++ b/src/calibre/ebooks/rtf2xml/replace_illegals.py
@@ -16,7 +16,10 @@
 #                                                                       #
 #########################################################################
 import os, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+from calibre.utils.cleantext import clean_ascii_chars
+
 class ReplaceIllegals:
     """
     reaplace illegal lower ascii characters
@@ -30,21 +33,14 @@ class ReplaceIllegals:
         self.__copy = copy
         self.__run_level = run_level
         self.__write_to = tempfile.mktemp()
+
     def replace_illegals(self):
         """
         """
-        nums = [0, 1, 2, 3, 4, 5, 6, 7, 8,  11,  13, 14, 15, 16, 17, 18, 19]
-        read_obj = open(self.__file, 'r')
-        write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            for num in nums:
-                line = line.replace(chr(num), '')
-            write_obj.write(line)
-        read_obj.close()
-        write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as write_obj:
+                for line in read_obj:
+                    write_obj.write(clean_ascii_chars(line))
         copy_obj = copy.Copy()
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "replace_illegals.data")
diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py
index 45887f33e7..de66415f0c 100755
--- a/src/calibre/ebooks/rtf2xml/tokenize.py
+++ b/src/calibre/ebooks/rtf2xml/tokenize.py
@@ -16,7 +16,10 @@
 #                                                                       #
 #########################################################################
 import os, re, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+from calibre.utils.mreplace import MReplace
+
 class Tokenize:
     """Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
     def __init__(self,
@@ -28,89 +31,175 @@ class Tokenize:
         self.__file = in_file
         self.__bug_handler = bug_handler
         self.__copy = copy
-        self.__special_tokens = [ '_', '~', "'", '{', '}' ]
         self.__write_to = tempfile.mktemp()
-    def __from_ms_to_utf8(self,match_obj):
-        uni_char = int(match_obj.group(1))
-        if uni_char < 0:
-            uni_char +=  65536
-        return   '&#x' + str('%X' % uni_char) + ';'
-    def __neg_unicode_func(self, match_obj):
-        neg_uni_char = int(match_obj.group(1)) * -1
-        # sys.stderr.write(str( neg_uni_char))
-        uni_char = neg_uni_char + 65536
-        return   '&#x' + str('%X' % uni_char) + ';'
-    def __sub_line_reg(self,line):
-        line = line.replace("\\\\", "\\backslash ")
-        line = line.replace("\\~", "\\~ ")
-        line = line.replace("\\;", "\\; ")
-        line = line.replace("&", "&amp;")
-        line = line.replace("<", "&lt;")
-        line = line.replace(">", "&gt;")
-        line = line.replace("\\~", "\\~ ")
-        line = line.replace("\\_", "\\_ ")
-        line = line.replace("\\:", "\\: ")
-        line = line.replace("\\-", "\\- ")
-        # turn into a generic token to eliminate special
-        # cases and make processing easier
-        line = line.replace("\\{", "\\ob ")
-        # turn into a generic token to eliminate special
-        # cases and make processing easier
-        line = line.replace("\\}", "\\cb ")
-        # put a backslash in front of to eliminate special cases and
-        # make processing easier
-        line = line.replace("{", "\\{")
-        # put a backslash in front of to eliminate special cases and
-        # make processing easier
-        line = line.replace("}", "\\}")
-        line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
-        # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
-        line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line)
-        ##line = line.replace("\\backslash", "\\\\")
-        # this is for older RTF
-        line = re.sub(self.__par_exp, '\\par ', line)
-        return line
-    def __compile_expressions(self):
-        self.__ms_hex_exp = re.compile(r"\\\'(..)")
-        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}")
-        self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
-        self.__par_exp = re.compile(r'\\$')
-        self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
-        ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
-    def __create_tokens(self):
         self.__compile_expressions()
-        read_obj = open(self.__file, 'r')
-        write_obj = open(self.__write_to, 'w')
-        line_to_read = "dummy"
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            line = line.replace("\n", "")
-            line =  self.__sub_line_reg(line)
-            tokens = re.split(self.__splitexp, line)
-            ##print tokens
-            for token in tokens:
-                if token != "":
-                    write_obj.write(token + "\n")
-                    """
-                    match_obj = re.search(self.__mixed_exp, token)
-                    if match_obj != None:
-                        first = match_obj.group(1)
-                        second = match_obj.group(2)
-                        write_obj.write(first + "\n")
-                        write_obj.write(second + "\n")
-                    else:
-                        write_obj.write(token + "\n")
-                    """
-        read_obj.close()
-        write_obj.close()
+        #variables
+        self.__uc_char = 0
+        self.__uc_bin = False
+        self.__uc_value = [1]
+
+    def __reini_utf8_counters(self):
+        self.__uc_char = 0  # number of fallback characters still to skip after a \u token
+        self.__uc_bin = False  # set when a \bin word is skipped; the following token is then dropped
+
+    def __remove_uc_chars(self, startchar, token):
+        """Skip spaces and pending fallback chars from startchar; return the rest."""
+        for pos in xrange(startchar, len(token)):
+            if token[pos] == " ":
+                continue
+            if not self.__uc_char:
+                return token[pos:]
+            self.__uc_char -= 1
+        # the token held nothing but spaces and characters to skip
+        return ''
+
+    def __unicode_process(self, token):
+        """Apply the uc/u fallback-skipping rules to one token; '' means drop it."""
+        if token == '\\{':  # change scope in: the new group inherits the current \uc value
+            self.__uc_value.append(self.__uc_value[-1])
+            #basic error handling
+            self.__reini_utf8_counters()
+            return token
+        #change scope out
+        elif token == '\\}':
+            self.__uc_value.pop()
+            self.__reini_utf8_counters()
+            return token
+        #add a uc control
+        elif token[:3] == '\\uc':
+            self.__uc_value[-1] = int(token[3:])
+            self.__reini_utf8_counters()
+            return token
+        #bin data to skip
+        elif self.__uc_bin:
+            self.__uc_bin = False
+            return ''
+        #uc char to remove
+        elif self.__uc_char:
+            #handle \bin tag in case of uc char to skip
+            #the backslash must be doubled: '\b' alone is the backspace character
+            if token[:4] == '\\bin':
+                self.__uc_char -=1
+                self.__uc_bin = True
+                return ''
+            elif token[:1] == "\\" :
+                self.__uc_char -=1
+                return ''
+            else:
+                return self.__remove_uc_chars(0, token)
+        #go for real \u token
+        match_obj = self.__utf_exp.match(token)
+        if match_obj is not None:
+            self.__reini_utf8_counters()
+            #get value and handle negative case
+            uni_char = int(match_obj.group(1))
+            uni_len = len(match_obj.group(1)) + 2
+            if uni_char < 0:
+                uni_char += 65536
+            uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
+            self.__uc_char = self.__uc_value[-1]
+            #there is only an unicode char
+            if len(token)<= uni_len:
+                return uni_char
+            #an unicode char and something else; must be after, as the text
+            #is split on backslash -- necessary? maybe for \bin?
+            elif not self.__uc_char:
+                return uni_char + token[uni_len:]
+            #if not uc0 and chars
+            else:
+                return uni_char + self.__remove_uc_chars(uni_len, token)
+        #default
+        return token
+
+    def __sub_reg_split(self,input_file):
+        """Apply the whole-file substitutions, then split the text into tokens.
+
+        input_file is the complete file content as one string. Returns the pieces
+        produced by self.__splitexp, minus empty strings and bare newlines.
+        """
+        #replacements listed in SIMPLE_RPL (see __compile_expressions)
+        input_file = self.__replace_spchar.mreplace(input_file)
+        #rewrite each \'hh hex escape as a \mshex0hh control word
+        input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
+        #a self.__utf_ud match is replaced by its captured group inside \{\uc0 ...\}
+        input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
+        #remove \n in bin data
+        input_file = self.__bin_exp.sub(lambda x: \
+                                        x.group().replace('\n', '') + '\n', input_file)
+        tokens = re.split(self.__splitexp, input_file)
+        return filter(lambda x: len(x) > 0 and x != '\n', tokens)
+
+    def __compile_expressions(self):
+        """Build the replacement table and the regular expressions used to tokenize."""
+        # source sequence -> replacement text, handed to MReplace below
+        SIMPLE_RPL = {
+            "\\\\": "\\backslash ",
+            "\\~": "\\~ ",
+            "\\;": "\\; ",
+            "&": "&amp;",
+            "<": "&lt;",
+            ">": "&gt;",
+            "\\_": "\\_ ",
+            "\\:": "\\: ",
+            "\\-": "\\- ",
+            # turn into a generic token to eliminate special
+            # cases and make processing easier
+            "\\{": "\\ob ",
+            # turn into a generic token to eliminate special
+            # cases and make processing easier
+            "\\}": "\\cb ",
+            # put a backslash in front of to eliminate special cases and
+            # make processing easier
+            "{": "\\{",
+            # put a backslash in front of to eliminate special cases and
+            # make processing easier
+            "}": "\\}",
+            # this is for older RTF
+            # NOTE(review): this key is written as a regex; if MReplace escapes
+            # its keys it only matches the literal text \\$ -- TODO confirm
+            r'\\$': '\\par ',
+            }
+        self.__replace_spchar = MReplace(SIMPLE_RPL)
+        # \'hh hex escape: exactly two hex digits
+        self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
+        # \uN unicode escape: optional sign, 3 to 6 digits, optional trailing space
+        #add ;? in case of char following \u
+        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
+        # \binN plus a run of 0/1 and newlines; __sub_reg_split strips those newlines
+        self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
+        #manage upr/ud situations
+        # "\\\*" matches the literal \* in front of \ud ("\\*" meant: any number of backslashes)
+        self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" + \
+                       r"\\{[\n ]?\\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
+        #add \n in split for whole file reading
+        #why keep backslash whereas \is replaced before?
+        #remove \n from endline char
+        self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
+
     def tokenize(self):
-        """Main class for handling other methods. Reads in one line \
-        at a time, usues method self.sub_line to make basic substitutions,\
-        uses ? to process tokens"""
-        self.__create_tokens()
+        """Main method: read the whole file, let self.__sub_reg_split make the
+        basic substitutions and split the text into tokens, run each token through
+        self.__unicode_process, then write the tokens back joined by newlines."""
+        #read
+        with open(self.__file, 'r') as read_obj:
+            input_file = read_obj.read()
+        
+        #process simple replacements and split giving us a correct list
+        #remove '' and \n in the process
+        tokens = self.__sub_reg_split(input_file)
+        #correct unicode
+        tokens = map(self.__unicode_process, tokens)
+        #remove empty items created by removing \uc
+        tokens = filter(lambda x: len(x) > 0, tokens)
+        
+        #write
+        with open(self.__write_to, 'wb') as write_obj:
+            write_obj.write('\n'.join(tokens))
+        #Move and copy
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "tokenize.data")
         copy_obj.rename(self.__write_to, self.__file)
         os.remove(self.__write_to)
+        
+        #self.__special_tokens = [ '_', '~', "'", '{', '}' ]
\ No newline at end of file