Remove unicode preprocessing of RTF files & correct rtftoxml

Slight modification in rtftoxml
This commit is contained in:
Sengian 2011-01-05 00:21:32 +01:00
commit 5ca4d81071
8 changed files with 349 additions and 291 deletions

View File

@ -198,21 +198,21 @@ class RTFInput(InputFormatPlugin):
with open('styles.css', 'ab') as f: with open('styles.css', 'ab') as f:
f.write(css) f.write(css)
def preprocess(self, fname): # def preprocess(self, fname):
self.log('\tPreprocessing to convert unicode characters') # self.log('\tPreprocessing to convert unicode characters')
try: # try:
data = open(fname, 'rb').read() # data = open(fname, 'rb').read()
from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser # from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
tokenizer = RtfTokenizer(data) # tokenizer = RtfTokenizer(data)
tokens = RtfTokenParser(tokenizer.tokens) # tokens = RtfTokenParser(tokenizer.tokens)
data = tokens.toRTF() # data = tokens.toRTF()
fname = 'preprocessed.rtf' # fname = 'preprocessed.rtf'
with open(fname, 'wb') as f: # with open(fname, 'wb') as f:
f.write(data) # f.write(data)
except: # except:
self.log.exception( # self.log.exception(
'Failed to preprocess RTF to convert unicode sequences, ignoring...') # 'Failed to preprocess RTF to convert unicode sequences, ignoring...')
return fname # return fname
def convert_borders(self, doc): def convert_borders(self, doc):
border_styles = [] border_styles = []
@ -249,9 +249,9 @@ class RTFInput(InputFormatPlugin):
self.log = log self.log = log
self.log('Converting RTF to XML...') self.log('Converting RTF to XML...')
#Name of the preprocessed RTF file #Name of the preprocessed RTF file
fname = self.preprocess(stream.name) # fname = self.preprocess(stream.name)
try: try:
xml = self.generate_xml(fname) xml = self.generate_xml(stream.name)
except RtfInvalidCodeException, e: except RtfInvalidCodeException, e:
raise ValueError(_('This RTF file has a feature calibre does not ' raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.\n%s')%e) 'support. Convert it to HTML first and then try it.\n%s')%e)

View File

@ -18,6 +18,7 @@
# $Revision: 1.41 $ # $Revision: 1.41 $
# $Date: 2006/03/24 23:50:07 $ # $Date: 2006/03/24 23:50:07 $
import sys, os import sys, os
from calibre.ebooks.rtf2xml import headings_to_sections, \ from calibre.ebooks.rtf2xml import headings_to_sections, \
line_endings, footnote, fields_small, default_encoding, \ line_endings, footnote, fields_small, default_encoding, \
make_lists, preamble_div, header, colors, group_borders, \ make_lists, preamble_div, header, colors, group_borders, \
@ -90,7 +91,6 @@ class ParseRtf:
out_file = '', out_file = '',
out_dir = None, out_dir = None,
dtd = '', dtd = '',
#debug = 0, #why? calibre
deb_dir = None, deb_dir = None,
convert_symbol = None, convert_symbol = None,
convert_wingdings = None, convert_wingdings = None,
@ -107,6 +107,7 @@ class ParseRtf:
no_dtd = 0, no_dtd = 0,
char_data = '', char_data = '',
): ):
""" """
Requires: Requires:
'file' --file to parse 'file' --file to parse
@ -119,12 +120,11 @@ class ParseRtf:
script tries to output to the directory where the script is executed.) script tries to output to the directory where the script is executed.)
'deb_dir' --debug directory. If a debug_dir is provided, the script 'deb_dir' --debug directory. If a debug_dir is provided, the script
will copy each run through as a file to examine in the debug_dir will copy each run through as a file to examine in the debug_dir
'perl_script'--use perl to make tokens. This runs just a bit faster.
(I will probably phase this out.)
'check_brackets' -- make sure the brackets match up after each run 'check_brackets' -- make sure the brackets match up after each run
through a file. Only for debugging. through a file. Only for debugging.
Returns: Nothing Returns: Nothing
""" """
self.__file = in_file self.__file = in_file
self.__out_file = out_file self.__out_file = out_file
self.__out_dir = out_dir self.__out_dir = out_dir
@ -132,7 +132,7 @@ class ParseRtf:
self.__dtd_path = dtd self.__dtd_path = dtd
self.__check_file(in_file,"file_to_parse") self.__check_file(in_file,"file_to_parse")
self.__char_data = char_data self.__char_data = char_data
self.__debug_dir = deb_dir #self.__debug_dir = debug calibre self.__debug_dir = deb_dir
self.__check_dir(self.__temp_dir) self.__check_dir(self.__temp_dir)
self.__copy = self.__check_dir(self.__debug_dir) self.__copy = self.__check_dir(self.__debug_dir)
self.__convert_caps = convert_caps self.__convert_caps = convert_caps
@ -155,25 +155,24 @@ class ParseRtf:
if hasattr(the_file, 'read'): return if hasattr(the_file, 'read'): return
if the_file == None: if the_file == None:
if type == "file_to_parse": if type == "file_to_parse":
message = "You must provide a file for the script to work" msg = _("\nYou must provide a file for the script to work")
msg = message
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
elif os.path.exists(the_file): elif os.path.exists(the_file):
pass # do nothing pass # do nothing
else: else:
message = "The file '%s' cannot be found" % the_file msg = _("\nThe file '%s' cannot be found") % the_file
msg = message
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
def __check_dir(self, the_dir): def __check_dir(self, the_dir):
"""Check to see if directory exists""" """Check to see if directory exists"""
if not the_dir : if not the_dir :
return return
dir_exists = os.path.isdir(the_dir) dir_exists = os.path.isdir(the_dir)
if not dir_exists: if not dir_exists:
message = "%s is not a directory" % the_dir msg = _("\n%s is not a directory") % the_dir
msg = message
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
return 1 return 1
def parse_rtf(self): def parse_rtf(self):
""" """
Parse the file by calling on other classes. Parse the file by calling on other classes.
@ -194,13 +193,14 @@ class ParseRtf:
copy_obj.set_dir(self.__debug_dir) copy_obj.set_dir(self.__debug_dir)
copy_obj.remove_files() copy_obj.remove_files()
copy_obj.copy_file(self.__temp_file, "original_file") copy_obj.copy_file(self.__temp_file, "original_file")
# new as of 2005-08-02. Do I want this? # Function to check if brackets are well handled
if self.__debug_dir or self.__run_level > 2: if self.__debug_dir or self.__run_level > 2:
self.__check_brack_obj = check_brackets.CheckBrackets\ self.__check_brack_obj = check_brackets.CheckBrackets\
(file = self.__temp_file, (file = self.__temp_file,
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
) )
# convert Macintosh line endings to Unix line endings # convert Macintosh and Windows line endings to Unix line endings
#why do this if you don't wb after?
line_obj = line_endings.FixLineEndings( line_obj = line_endings.FixLineEndings(
in_file = self.__temp_file, in_file = self.__temp_file,
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
@ -208,13 +208,13 @@ class ParseRtf:
run_level = self.__run_level, run_level = self.__run_level,
replace_illegals = self.__replace_illegals, replace_illegals = self.__replace_illegals,
) )
return_value = line_obj.fix_endings() return_value = line_obj.fix_endings() #calibre return what?
self.__return_code(return_value) self.__return_code(return_value)
tokenize_obj = tokenize.Tokenize( tokenize_obj = tokenize.Tokenize(
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
in_file = self.__temp_file, in_file = self.__temp_file,
copy = self.__copy, copy = self.__copy,
run_level = self.__run_level,) run_level = self.__run_level)
tokenize_obj.tokenize() tokenize_obj.tokenize()
process_tokens_obj = process_tokens.ProcessTokens( process_tokens_obj = process_tokens.ProcessTokens(
in_file = self.__temp_file, in_file = self.__temp_file,
@ -230,11 +230,13 @@ class ParseRtf:
os.remove(self.__temp_file) os.remove(self.__temp_file)
except OSError: except OSError:
pass pass
#Check to see if the file is correct ascii
check_encoding_obj = check_encoding.CheckEncoding( check_encoding_obj = check_encoding.CheckEncoding(
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
) )
check_encoding_obj.check_encoding(self.__file) if check_encoding_obj.check_encoding(self.__file):
sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8')) sys.stderr.write(_('File "%s" does not appear to be ascii.\n') \
% self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
raise InvalidRtfException, msg raise InvalidRtfException, msg
delete_info_obj = delete_info.DeleteInfo( delete_info_obj = delete_info.DeleteInfo(
in_file = self.__temp_file, in_file = self.__temp_file,
@ -370,10 +372,10 @@ class ParseRtf:
sys.stderr.write('File could be older RTF...\n') sys.stderr.write('File could be older RTF...\n')
if found_destination: if found_destination:
if self.__run_level > 1: if self.__run_level > 1:
sys.stderr.write( sys.stderr.write(_(
'File also has newer RTF.\n' 'File also has newer RTF.\n'
'Will do the best to convert.\n' 'Will do the best to convert.\n'
) ))
add_brackets_obj = add_brackets.AddBrackets( add_brackets_obj = add_brackets.AddBrackets(
in_file = self.__temp_file, in_file = self.__temp_file,
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
@ -520,6 +522,7 @@ class ParseRtf:
output_obj.output() output_obj.output()
os.remove(self.__temp_file) os.remove(self.__temp_file)
return self.__exit_level return self.__exit_level
def __bracket_match(self, file_name): def __bracket_match(self, file_name):
if self.__run_level > 2: if self.__run_level > 2:
good_br, msg = self.__check_brack_obj.check_brackets() good_br, msg = self.__check_brack_obj.check_brackets()
@ -527,28 +530,20 @@ class ParseRtf:
pass pass
#sys.stderr.write( msg + ' in ' + file_name + "\n") #sys.stderr.write( msg + ' in ' + file_name + "\n")
else: else:
msg += msg + " in file '" + file_name + "'\n" msg = _('%s in file %s\n') % (msg, file_name)
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
def __return_code(self, num): def __return_code(self, num):
if num == None: if num == None:
return return
if int(num) > self.__exit_level: if int(num) > self.__exit_level:
self.__exit_level = num self.__exit_level = num
def __make_temp_file(self,file): def __make_temp_file(self,file):
"""Make a temporary file to parse""" """Make a temporary file to parse"""
write_file="rtf_write_file" write_file="rtf_write_file"
read_obj = file if hasattr(file, 'read') else open(file,'r') read_obj = file if hasattr(file, 'read') else open(file,'r')
write_obj = open(write_file, 'w') with open(write_file, 'wb') as write_obj:
line = "dummy" for line in read_obj:
while line:
line = read_obj.read(1000)
write_obj.write(line) write_obj.write(line)
write_obj.close()
return write_file return write_file
"""
mi<tg<open______<style-sheet\n
mi<tg<close_____<style-sheet\n
mi<tg<open-att__<footnote<num>1\n
mi<tg<empty-att_<page-definition<margin>33\n
mi<tg<empty_____<para\n
"""

View File

@ -24,38 +24,37 @@ class CheckBrackets:
self.__ob_count = 0 self.__ob_count = 0
self.__cb_count = 0 self.__cb_count = 0
self.__open_bracket_num = [] self.__open_bracket_num = []
def open_brack(self, line): def open_brack(self, line):
num = line[-5:-1] num = line[-5:-1]
self.__open_bracket_num.append(num) self.__open_bracket_num.append(num)
self.__bracket_count += 1 self.__bracket_count += 1
def close_brack(self, line): def close_brack(self, line):
num = line[-5:-1] num = line[-5:-1]
##self.__open_bracket_num.append(num)
try: try:
last_num = self.__open_bracket_num.pop() last_num = self.__open_bracket_num.pop()
except: except:
return 0 return False
if num != last_num: if num != last_num:
return 0 return False
self.__bracket_count -= 1 self.__bracket_count -= 1
return 1 return True
def check_brackets(self): def check_brackets(self):
read_obj = open(self.__file, 'r')
line = 'dummy'
line_count = 0 line_count = 0
while line: with open(self.__file, 'r') as read_obj:
for line in read_obj:
line_count += 1 line_count += 1
line = read_obj.readline()
self.__token_info = line[:16] self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack': if self.__token_info == 'ob<nu<open-brack':
self.open_brack(line) self.open_brack(line)
if self.__token_info == 'cb<nu<clos-brack': if self.__token_info == 'cb<nu<clos-brack':
right_count = self.close_brack(line) if not self.close_brack(line):
if not right_count: return (False, "closed bracket doesn't match, line %s" % line_count)
return (0, "closed bracket doesn't match, line %s" % line_count)
read_obj.close()
if self.__bracket_count != 0: if self.__bracket_count != 0:
msg = 'At end of file open and closed brackets don\'t match\n' msg = _('At end of file open and closed brackets don\'t match\n' \
msg = msg + 'total number of brackets is %s' % self.__bracket_count 'total number of brackets is %s') % self.__bracket_count
return (0, msg) return (False, msg)
return (1, "brackets match!") return (True, _("Brackets match!"))

View File

@ -1,8 +1,10 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
class CheckEncoding: class CheckEncoding:
def __init__(self, bug_handler): def __init__(self, bug_handler):
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
def __get_position_error(self, line, encoding, line_num): def __get_position_error(self, line, encoding, line_num):
char_position = 0 char_position = 0
for char in line: for char in line:
@ -10,23 +12,24 @@ class CheckEncoding:
try: try:
char.decode(encoding) char.decode(encoding)
except UnicodeError, msg: except UnicodeError, msg:
sys.stderr.write('line: %s char: %s\n' % (line_num, char_position)) sys.stderr.write(_('line: %s char: %s\n') % (line_num, char_position))
sys.stderr.write(str(msg) + '\n') sys.stderr.write(str(msg) + '\n')
def check_encoding(self, path, encoding='us-ascii'): def check_encoding(self, path, encoding='us-ascii'):
read_obj = open(path, 'r')
line_to_read = 1
line_num = 0 line_num = 0
while line_to_read: with open(path, 'r') as read_obj:
for line in read_obj:
line_num += 1 line_num += 1
line_to_read = read_obj.readline()
line = line_to_read
try: try:
line.decode(encoding) line.decode(encoding)
except UnicodeError: except UnicodeError:
if len(line) < 1000: if len(line) < 1000:
self.__get_position_error(line, encoding, line_num) self.__get_position_error(line, encoding, line_num)
else: else:
sys.stderr.write('line: %d has bad encoding\n'%line_num) sys.stderr.write(_('line: %d has bad encoding\n') % line_num)
return True
return False
if __name__ == '__main__': if __name__ == '__main__':
check_encoding_obj = CheckEncoding() check_encoding_obj = CheckEncoding()
check_encoding_obj.check_encoding(sys.argv[1]) check_encoding_obj.check_encoding(sys.argv[1])

View File

@ -23,6 +23,7 @@ class Copy:
def __init__(self, bug_handler, file = None, deb_dir = None, ): def __init__(self, bug_handler, file = None, deb_dir = None, ):
self.__file = file self.__file = file
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
def set_dir(self, deb_dir): def set_dir(self, deb_dir):
"""Set the temporary directory to write files to""" """Set the temporary directory to write files to"""
if deb_dir is None: if deb_dir is None:
@ -33,19 +34,11 @@ class Copy:
message = "%(deb_dir)s is not a directory" % vars() message = "%(deb_dir)s is not a directory" % vars()
raise self.__bug_handler , message raise self.__bug_handler , message
Copy.__dir = deb_dir Copy.__dir = deb_dir
def remove_files(self ): def remove_files(self ):
"""Remove files from directory""" """Remove files from directory"""
self.__remove_the_files(Copy.__dir) self.__remove_the_files(Copy.__dir)
"""
list_of_files = os.listdir(Copy.__dir)
list_of_files = os.listdir(the_dir)
for file in list_of_files:
rem_file = os.path.join(Copy.__dir,file)
if os.path.isdir(rem_file):
self.remove_files(rem_file)
else:
os.remove(rem_file)
"""
def __remove_the_files(self, the_dir): def __remove_the_files(self, the_dir):
"""Remove files from directory""" """Remove files from directory"""
list_of_files = os.listdir(the_dir) list_of_files = os.listdir(the_dir)
@ -58,6 +51,7 @@ class Copy:
os.remove(rem_file) os.remove(rem_file)
except OSError: except OSError:
pass pass
def copy_file(self, file, new_file): def copy_file(self, file, new_file):
""" """
Copy the file to a new name Copy the file to a new name

View File

@ -16,7 +16,10 @@
# # # #
######################################################################### #########################################################################
import os, tempfile, re import os, tempfile, re
from calibre.ebooks.rtf2xml import copy from calibre.ebooks.rtf2xml import copy
from calibre.utils.cleantext import clean_ascii_chars
class FixLineEndings: class FixLineEndings:
"""Fix line endings""" """Fix line endings"""
def __init__(self, def __init__(self,
@ -32,34 +35,21 @@ class FixLineEndings:
self.__run_level = run_level self.__run_level = run_level
self.__write_to = tempfile.mktemp() self.__write_to = tempfile.mktemp()
self.__replace_illegals = replace_illegals self.__replace_illegals = replace_illegals
def fix_endings(self): def fix_endings(self):
##tempFileName = tempfile.mktemp() #read
illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13') with open(self.__file, 'r') as read_obj:
#nums = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16, 17, 18, 19] input_file = read_obj.read()
""" #calibre go from win and mac to unix
read_obj = open(self.__file, 'r') input_file = input_file.replace ('\r\n', '\n')
line = read_obj.read(1000) input_file = input_file.replace ('\r', '\n')
regexp = re.compile(r"\r") #remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
macintosh = regexp.search(line)
read_obj.close()
"""
# always check since I have to get rid of illegal characters
macintosh = 1
if macintosh:
line = 1
read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'w')
while line:
line = read_obj.read(1000)
# line = re.sub(regexp,"\n",line)
line = line.replace ('\r', '\n')
if self.__replace_illegals: if self.__replace_illegals:
line = re.sub(illegal_regx, '', line) input_file = clean_ascii_chars(input_file)
# for num in nums: #write
# line = line.replace(chr(num), '') with open(self.__write_to, 'wb') as write_obj:
write_obj.write(line ) write_obj.write(input_file)
read_obj.close() #copy
write_obj.close()
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "line_endings.data") copy_obj.copy_file(self.__write_to, "line_endings.data")

View File

@ -16,7 +16,9 @@
# # # #
######################################################################### #########################################################################
import os, re, tempfile import os, re, tempfile
from calibre.ebooks.rtf2xml import copy, check_brackets from calibre.ebooks.rtf2xml import copy, check_brackets
class ProcessTokens: class ProcessTokens:
""" """
Process each token on a line and add information that will be useful for Process each token on a line and add information that will be useful for
@ -41,9 +43,11 @@ class ProcessTokens:
self.__bracket_count=0 self.__bracket_count=0
self.__exception_handler = exception_handler self.__exception_handler = exception_handler
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
def compile_expressions(self): def compile_expressions(self):
self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)") self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
self.__utf_exp = re.compile(r'(&.*?;)') self.__utf_exp = re.compile(r'(&.*?;)')
def initiate_token_dict(self): def initiate_token_dict(self):
self.__return_code = 0 self.__return_code = 0
self.dict_token={ self.dict_token={
@ -595,12 +599,15 @@ class ProcessTokens:
num = num[1:] # chop off leading 0, which I added num = num[1:] # chop off leading 0, which I added
num = num.upper() # the mappings store hex in caps num = num.upper() # the mappings store hex in caps
return 'tx<hx<__________<\'%s\n' % num # add an ' for the mappings return 'tx<hx<__________<\'%s\n' % num # add an ' for the mappings
def ms_sub_func(self, pre, token, num): def ms_sub_func(self, pre, token, num):
return 'tx<mc<__________<%s\n' % token return 'tx<mc<__________<%s\n' % token
def default_func(self, pre, token, num): def default_func(self, pre, token, num):
if num == None: if num == None:
num = 'true' num = 'true'
return 'cw<%s<%s<nu<%s\n' % (pre, token, num) return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
def __list_type_func(self, pre, token, num): def __list_type_func(self, pre, token, num):
type = 'arabic' type = 'arabic'
if num == None: if num == None:
@ -610,15 +617,16 @@ class ProcessTokens:
num = int(num) num = int(num)
except ValueError: except ValueError:
if self.__run_level > 3: if self.__run_level > 3:
msg = 'number "%s" cannot be converted to integer\n' % num msg = _('Number "%s" cannot be converted to integer\n') % num
raise self.__bug_handler, msg raise self.__bug_handler, msg
type = self.__number_type_dict.get(num) type = self.__number_type_dict.get(num)
if type == None: if type == None:
if self.__run_level > 3: if self.__run_level > 3:
msg = 'No type for "%s" in self.__number_type_dict\n' msg = _('No type for "%s" in self.__number_type_dict\n')
raise self.__bug_handler raise self.__bug_handler
type = 'Arabic' type = 'Arabic'
return 'cw<%s<%s<nu<%s\n' % (pre, token, type) return 'cw<%s<%s<nu<%s\n' % (pre, token, type)
def __language_func(self, pre, token, num): def __language_func(self, pre, token, num):
lang_name = self.__language_dict.get(int(re.search('[0-9]+', num).group())) lang_name = self.__language_dict.get(int(re.search('[0-9]+', num).group()))
if not lang_name: if not lang_name:
@ -627,31 +635,36 @@ class ProcessTokens:
msg = 'No entry for number "%s"' % num msg = 'No entry for number "%s"' % num
raise self.__bug_handler, msg raise self.__bug_handler, msg
return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name) return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name)
def two_part_func(self, pre, token, num): def two_part_func(self, pre, token, num):
list = token.split("<") list = token.split("<")
token = list[0] token = list[0]
num = list[1] num = list[1]
return 'cw<%s<%s<nu<%s\n' % (pre, token, num) return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
##return 'cw<nu<nu<nu<%s>num<%s\n' % (token, num) ##return 'cw<nu<nu<nu<%s>num<%s\n' % (token, num)
def divide_by_2(self, pre, token, num): def divide_by_2(self, pre, token, num):
num = self.divide_num(num, 2) num = self.divide_num(num, 2)
return 'cw<%s<%s<nu<%s\n' % (pre, token, num) return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token) ##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
def divide_by_20(self, pre, token, num): def divide_by_20(self, pre, token, num):
num = self.divide_num(num, 20) num = self.divide_num(num, 20)
return 'cw<%s<%s<nu<%s\n' % (pre, token, num) return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token) ##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
def text_func(self, pre, token, num=None): def text_func(self, pre, token, num=None):
return 'tx<nu<__________<%s\n' % token return 'tx<nu<__________<%s\n' % token
def ob_func(self, pre, token, num=None): def ob_func(self, pre, token, num=None):
self.__bracket_count += 1 self.__bracket_count += 1
##return 'ob<%04d\n' % self.__bracket_count
return 'ob<nu<open-brack<%04d\n' % self.__bracket_count return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
def cb_func(self, pre, token, num=None): def cb_func(self, pre, token, num=None):
##line = 'cb<%04d\n' % self.__bracket_count
line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
self.__bracket_count -= 1 self.__bracket_count -= 1
return line return line
def color_func(self, pre, token, num): def color_func(self, pre, token, num):
third_field = 'nu' third_field = 'nu'
if num[-1] == ';': if num[-1] == ';':
@ -662,6 +675,7 @@ class ProcessTokens:
num = "0" + num num = "0" + num
return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, num) return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, num)
##return 'cw<cl<%s<nu<nu<%s>%s<%s\n' % (third_field, token, num, token) ##return 'cw<cl<%s<nu<nu<%s>%s<%s\n' % (third_field, token, num, token)
def bool_st_func(self, pre, token, num): def bool_st_func(self, pre, token, num):
if num is None or num == '' or num == '1': if num is None or num == '' or num == '1':
return 'cw<%s<%s<nu<true\n' % (pre, token) return 'cw<%s<%s<nu<true\n' % (pre, token)
@ -674,13 +688,16 @@ class ProcessTokens:
msg += 'token is ' + token + "\n" msg += 'token is ' + token + "\n"
msg += "'" + num + "'" + "\n" msg += "'" + num + "'" + "\n"
raise self.__bug_handler, msg raise self.__bug_handler, msg
def __no_sup_sub_func(self, pre, token, num): def __no_sup_sub_func(self, pre, token, num):
the_string = 'cw<ci<subscript_<nu<false\n' the_string = 'cw<ci<subscript_<nu<false\n'
the_string += 'cw<ci<superscrip<nu<false\n' the_string += 'cw<ci<superscrip<nu<false\n'
return the_string return the_string
def divide_num(self, numerator, denominator): def divide_num(self, numerator, denominator):
try: try:
numerator = float(re.search('[0-9.]+', numerator).group()) #calibre why ignore negative number? Wrong in case of \fi
numerator = float(re.search('[0-9.\-]+', numerator).group())
except TypeError, msg: except TypeError, msg:
if self.__run_level > 3: if self.__run_level > 3:
msg = 'no number to process?\n' msg = 'no number to process?\n'
@ -698,6 +715,7 @@ class ProcessTokens:
if string_num[-2:] == ".0": if string_num[-2:] == ".0":
string_num = string_num[:-2] string_num = string_num[:-2]
return string_num return string_num
def split_let_num(self, token): def split_let_num(self, token):
match_obj = re.search(self.__num_exp,token) match_obj = re.search(self.__num_exp,token)
if match_obj != None: if match_obj != None:
@ -714,6 +732,7 @@ class ProcessTokens:
raise self.__bug_handler raise self.__bug_handler
return token, 0 return token, 0
return first, second return first, second
def convert_to_hex(self,number): def convert_to_hex(self,number):
"""Convert a string to uppercase hexadecimal""" """Convert a string to uppercase hexadecimal"""
num = int(number) num = int(number)
@ -722,6 +741,7 @@ class ProcessTokens:
return hex_num return hex_num
except: except:
raise self.__bug_handler raise self.__bug_handler
def process_cw(self, token): def process_cw(self, token):
"""Change the value of the control word by determining what dictionary """Change the value of the control word by determining what dictionary
it belongs to""" it belongs to"""
@ -737,69 +757,36 @@ class ProcessTokens:
pre, token, action = self.dict_token.get(token, (None, None, None)) pre, token, action = self.dict_token.get(token, (None, None, None))
if action: if action:
return action(pre, token, num) return action(pre, token, num)
# unused function
def initiate_token_actions(self):
self.action_for_token={
'{' : self.ob_func,
'}' : self.cb_func,
'\\' : self.process_cw,
}
# unused function
def evaluate_token(self,token):
"""Evaluate tokens. Return a value if the token is not a
control word. Otherwise, pass token onto another method
for further evaluation."""
token, action = self.dict_token.get(token[0:1])
if action:
line = action(token)
return line
else :
return 'tx<nu<nu<nu<nu<%s\n' % token
def __check_brackets(self, in_file): def __check_brackets(self, in_file):
self.__check_brack_obj = check_brackets.CheckBrackets\ self.__check_brack_obj = check_brackets.CheckBrackets\
(file = in_file) (file = in_file)
good_br = self.__check_brack_obj.check_brackets()[0] good_br = self.__check_brack_obj.check_brackets()[0]
if not good_br: if not good_br:
return 1 return 1
def process_tokens(self): def process_tokens(self):
"""Main method for handling other methods. """ """Main method for handling other methods. """
first_token = 0
second_token = 0
read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'w')
line_to_read = "dummy"
line_count = 0 line_count = 0
while line_to_read: with open(self.__file, 'r') as read_obj, open(self.__write_to, 'wb') as write_obj:
line_to_read = read_obj.readline() for line in read_obj:
token = line_to_read token = line.replace("\n","")
token = token.replace("\n","")
if not token:
continue
line_count += 1 line_count += 1
try: if line_count == 1 and token != '\\{':
token.decode('us-ascii') msg = _('Invalid RTF: document doesn\'t start with {\n')
except UnicodeError, msg:
msg = str(msg)
msg += 'Invalid RTF: File not ascii encoded.\n'
raise self.__exception_handler, msg raise self.__exception_handler, msg
if not first_token: elif line_count == 2 and token[0:4] != '\\rtf':
if token != '\\{': msg =_('Invalid RTF: document doesn\'t start with \\rtf \n')
msg = 'Invalid RTF: document doesn\'t start with {\n'
raise self.__exception_handler, msg raise self.__exception_handler, msg
first_token = 1
elif first_token and not second_token:
if token[0:4] != '\\rtf':
msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
raise self.__exception_handler, msg
second_token = 1
##token = self.evaluate_token(token) ##token = self.evaluate_token(token)
the_index = token.find('\\ ') the_index = token.find('\\ ')
if token != None and the_index > -1: if token is not None and the_index > -1:
msg ='Invalid RTF: token "\\ " not valid.\n' msg ='Invalid RTF: token "\\ " not valid.\n'
raise self.__exception_handler, msg raise self.__exception_handler, msg
elif token[0:1] == "\\": elif token[:1] == "\\":
line = self.process_cw(token) line = self.process_cw(token)
if line != None: if line is not None:
write_obj.write(line) write_obj.write(line)
else: else:
fields = re.split(self.__utf_exp, token) fields = re.split(self.__utf_exp, token)
@ -810,19 +797,20 @@ class ProcessTokens:
write_obj.write('tx<ut<__________<%s\n' % field) write_obj.write('tx<ut<__________<%s\n' % field)
else: else:
write_obj.write('tx<nu<__________<%s\n' % field) write_obj.write('tx<nu<__________<%s\n' % field)
read_obj.close()
write_obj.close()
if not line_count: if not line_count:
msg ='Invalid RTF: file appears to be empty. \n' msg =_('Invalid RTF: file appears to be empty.\n')
raise self.__exception_handler, msg raise self.__exception_handler, msg
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "processed_tokens.data") copy_obj.copy_file(self.__write_to, "processed_tokens.data")
copy_obj.rename(self.__write_to, self.__file) copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to) os.remove(self.__write_to)
bad_brackets = self.__check_brackets(self.__file) bad_brackets = self.__check_brackets(self.__file)
if bad_brackets: if bad_brackets:
msg = 'Invalid RTF: document does not have matching brackets.\n' msg = _('Invalid RTF: document does not have matching brackets.\n')
raise self.__exception_handler, msg raise self.__exception_handler, msg
else: else:
return self.__return_code return self.__return_code

View File

@ -16,7 +16,10 @@
# # # #
######################################################################### #########################################################################
import os, re, tempfile import os, re, tempfile
from calibre.ebooks.rtf2xml import copy from calibre.ebooks.rtf2xml import copy
from calibre.utils.mreplace import MReplace
class Tokenize: class Tokenize:
"""Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script""" """Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
def __init__(self, def __init__(self,
@ -28,89 +31,175 @@ class Tokenize:
self.__file = in_file self.__file = in_file
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
self.__copy = copy self.__copy = copy
self.__special_tokens = [ '_', '~', "'", '{', '}' ]
self.__write_to = tempfile.mktemp() self.__write_to = tempfile.mktemp()
def __from_ms_to_utf8(self,match_obj): self.__compile_expressions()
#variables
self.__uc_char = 0
self.__uc_bin = False
self.__uc_value = [1]
def __reini_utf8_counters(self):
    """Reset the pending unicode-escape state.

    Clears the flag marking the next token as \\bin payload to drop and
    the count of replacement characters still to be skipped after a
    \\uN token.
    """
    self.__uc_bin = False
    self.__uc_char = 0
def __remove_uc_chars(self, startchar, token):
    """Consume the ANSI replacement characters that follow a \\uN escape.

    Walks ``token`` from position ``startchar``: spaces are ignored, and
    each non-space character decrements ``self.__uc_char`` (the number of
    replacement characters the current \\ucN scope says to skip).  Returns
    the remainder of the token once the quota is exhausted, or '' when the
    token held nothing but spaces and skipped characters.
    """
    pos = startchar
    end = len(token)
    while pos < end:
        if token[pos] != " ":
            if not self.__uc_char:
                return token[pos:]
            self.__uc_char -= 1
        pos += 1
    # only spaces and skipped replacement characters were present
    return ''
def __unicode_process(self, token):
#change scope in
if token == '\{':
self.__uc_value.append(self.__uc_value[-1])
#basic error handling
self.__reini_utf8_counters()
return token
#change scope out
elif token == '\}':
self.__uc_value.pop()
self.__reini_utf8_counters()
return token
#add a uc control
elif token[:3] == '\uc':
self.__uc_value[-1] = int(token[3:])
self.__reini_utf8_counters()
return token
#bin data to slip
elif self.__uc_bin:
self.__uc_bin = False
return ''
#uc char to remove
elif self.__uc_char:
#handle \bin tag in case of uc char to skip
if token[:4] == '\bin':
self.__uc_char -=1
self.__uc_bin = True
return ''
elif token[:1] == "\\" :
self.__uc_char -=1
return ''
else:
return self.__remove_uc_chars(0, token)
#go for real \u token
match_obj = self.__utf_exp.match(token)
if match_obj is not None:
self.__reini_utf8_counters()
#get value and handle negative case
uni_char = int(match_obj.group(1)) uni_char = int(match_obj.group(1))
uni_len = len(match_obj.group(1)) + 2
if uni_char < 0: if uni_char < 0:
uni_char += 65536 uni_char += 65536
return '&#x' + str('%X' % uni_char) + ';' uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
def __neg_unicode_func(self, match_obj): self.__uc_char = self.__uc_value[-1]
neg_uni_char = int(match_obj.group(1)) * -1 #there is only an unicode char
# sys.stderr.write(str( neg_uni_char)) if len(token)<= uni_len:
uni_char = neg_uni_char + 65536 return uni_char
return '&#x' + str('%X' % uni_char) + ';' #an unicode char and something else
def __sub_line_reg(self,line): #must be after as it is splited on \
line = line.replace("\\\\", "\\backslash ") #necessary? maybe for \bin?
line = line.replace("\\~", "\\~ ") elif not self.__uc_char:
line = line.replace("\\;", "\\; ") return uni_char + token[uni_len:]
line = line.replace("&", "&amp;") #if not uc0 and chars
line = line.replace("<", "&lt;")
line = line.replace(">", "&gt;")
line = line.replace("\\~", "\\~ ")
line = line.replace("\\_", "\\_ ")
line = line.replace("\\:", "\\: ")
line = line.replace("\\-", "\\- ")
# turn into a generic token to eliminate special
# cases and make processing easier
line = line.replace("\\{", "\\ob ")
# turn into a generic token to eliminate special
# cases and make processing easier
line = line.replace("\\}", "\\cb ")
# put a backslash in front of to eliminate special cases and
# make processing easier
line = line.replace("{", "\\{")
# put a backslash in front of to eliminate special cases and
# make processing easier
line = line.replace("}", "\\}")
line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line)
##line = line.replace("\\backslash", "\\\\")
# this is for older RTF
line = re.sub(self.__par_exp, '\\par ', line)
return line
def __compile_expressions(self):
self.__ms_hex_exp = re.compile(r"\\\'(..)")
self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}")
self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
self.__par_exp = re.compile(r'\\$')
self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
def __create_tokens(self):
self.__compile_expressions()
read_obj = open(self.__file, 'r')
write_obj = open(self.__write_to, 'w')
line_to_read = "dummy"
while line_to_read:
line_to_read = read_obj.readline()
line = line_to_read
line = line.replace("\n", "")
line = self.__sub_line_reg(line)
tokens = re.split(self.__splitexp, line)
##print tokens
for token in tokens:
if token != "":
write_obj.write(token + "\n")
"""
match_obj = re.search(self.__mixed_exp, token)
if match_obj != None:
first = match_obj.group(1)
second = match_obj.group(2)
write_obj.write(first + "\n")
write_obj.write(second + "\n")
else: else:
write_obj.write(token + "\n") return uni_char + self.__remove_uc_chars(uni_len, token)
""" #default
read_obj.close() return token
write_obj.close()
def __sub_reg_split(self, input_file):
    """Apply the whole-file substitutions, then split the text into tokens.

    Runs, in order: the one-pass literal replacements from MReplace, the
    \\'xx hex-escape rewrite, the \\upr/\\ud unicode-group reduction, and
    newline stripping inside \\bin payloads.  The result is split on the
    pre-compiled tokenizing expression; empty strings and bare newlines
    produced by the split are dropped from the returned list.
    """
    text = self.__replace_spchar.mreplace(input_file)
    text = self.__ms_hex_exp.sub("\\mshex0\g<1> ", text)
    text = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", text)
    # \bin payloads may span lines; remove the newlines they contain
    # but keep one trailing newline so the split stays aligned
    text = self.__bin_exp.sub(
        lambda m: m.group().replace('\n', '') + '\n', text)
    # split on control words/groups, discarding '' and stray newlines
    return [tok for tok in re.split(self.__splitexp, text)
            if tok and tok != '\n']
def __compile_expressions(self):
    """Build the replacement table and regexes used by __sub_reg_split.

    SIMPLE_RPL feeds MReplace for a single-pass literal substitution over
    the whole file; the compiled patterns handle \\'xx hex escapes, \\uN
    tokens, \\bin payloads, \\upr/\\ud groups and the final tokenizing
    split.

    Fix: the key ``"\\~"`` appeared twice in the dict literal; in Python
    the second occurrence silently overwrote the first (identical) entry,
    so the duplicate is removed with no behavior change.
    """
    SIMPLE_RPL = {
        "\\\\": "\\backslash ",
        "\\~": "\\~ ",
        "\\;": "\\; ",
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        "\\_": "\\_ ",
        "\\:": "\\: ",
        "\\-": "\\- ",
        # turn into a generic token to eliminate special
        # cases and make processing easier
        "\\{": "\\ob ",
        # turn into a generic token to eliminate special
        # cases and make processing easier
        "\\}": "\\cb ",
        # put a backslash in front of to eliminate special cases and
        # make processing easier
        "{": "\\{",
        # put a backslash in front of to eliminate special cases and
        # make processing easier
        "}": "\\}",
        # this is for older RTF
        # NOTE(review): as an MReplace key this matches the LITERAL text
        # backslash-backslash-dollar, whereas the old code used the regex
        # r'\\$' (a lone backslash at end of line) -- confirm intended.
        r'\\$': '\\par ',
    }
    self.__replace_spchar = MReplace(SIMPLE_RPL)
    #add ;? in case of char following \u
    self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
    self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
    self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
    #manage upr/ud situations
    self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" + \
                    r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
    #add \n in split for whole file reading
    #remove \n from endline char
    self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
def tokenize(self): def tokenize(self):
"""Main class for handling other methods. Reads in one line \ """Main class for handling other methods. Reads the file \
at a time, usues method self.sub_line to make basic substitutions,\ , uses method self.sub_reg to make basic substitutions,\
uses ? to process tokens""" and process tokens by itself"""
self.__create_tokens() #read
with open(self.__file, 'r') as read_obj:
input_file = read_obj.read()
#process simple replacements and split giving us a correct list
#remove '' and \n in the process
tokens = self.__sub_reg_split(input_file)
#correct unicode
tokens = map(self.__unicode_process, tokens)
#remove empty items created by removing \uc
tokens = filter(lambda x: len(x) > 0, tokens)
#write
with open(self.__write_to, 'wb') as write_obj:
write_obj.write('\n'.join(tokens))
#Move and copy
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "tokenize.data") copy_obj.copy_file(self.__write_to, "tokenize.data")
copy_obj.rename(self.__write_to, self.__file) copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to) os.remove(self.__write_to)
#self.__special_tokens = [ '_', '~', "'", '{', '}' ]