Correct handling of \ as \par for old RTF

Sengian 2011-01-15 13:21:13 +01:00
parent c8810ce522
commit 10c2e603e2
5 changed files with 36 additions and 14 deletions
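The change itself is small: in RTF produced by very old writers, a bare backslash at the end of a line is shorthand for \par (a paragraph break). This commit stops trying to handle that case through the tokenizer's literal string-replacement table and instead compiles one regex, \\\n+, applied to the whole file before splitting. A minimal sketch of that substitution (the sample string is made up; note the original Python 2 code passes '\n\\par \n', whose unknown escape \p Python 2's re left alone, while modern Python needs the backslash doubled):

    import re

    # \ followed by one or more newlines marks a paragraph break in old RTF
    par_exp = re.compile(r'\\\n+')

    old_rtf = 'First paragraph\\\nSecond paragraph'
    # Pad with newlines and a trailing space so '\par ' later splits out
    # as a token of its own (see the Tokenize hunks below).
    print(par_exp.sub('\n\\\\par \n', old_rtf))
    # First paragraph
    # \par
    # Second paragraph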

View File

@@ -312,7 +312,6 @@ class RTFInput(InputFormatPlugin):
         try:
             xml = self.generate_xml(stream.name)
         except RtfInvalidCodeException, e:
-            raise
             raise ValueError(_('This RTF file has a feature calibre does not '
                 'support. Convert it to HTML first and then try it.\n%s')%e)
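For context on this hunk: a bare raise inside an except block re-raises the active exception immediately, so the user-facing ValueError below it was unreachable until the stray raise was removed. A tiny demonstration with stand-in exception types:

    def before_fix():
        try:
            raise RuntimeError('parser failure')   # stands in for RtfInvalidCodeException
        except RuntimeError as e:
            raise                                  # original exception escapes here...
            raise ValueError('friendly: %s' % e)   # ...so this line is dead code

    try:
        before_fix()
    except Exception as e:
        print(type(e).__name__)   # prints RuntimeError, never ValueError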

View File

@@ -226,10 +226,6 @@ class ParseRtf:
         try:
             return_value = process_tokens_obj.process_tokens()
         except InvalidRtfException, msg:
-            try:
-                os.remove(self.__temp_file)
-            except OSError:
-                pass
             #Check to see if the file is correctly encoded
             encode_obj = default_encoding.DefaultEncoding(
                 in_file = self.__temp_file,
@@ -244,11 +240,16 @@ class ParseRtf:
             enc = encode_obj.get_codepage()
             if enc != 'mac_roman':
                 enc = 'cp' + enc
+            msg = 'Exception in token processing'
             if check_encoding_obj.check_encoding(self.__file, enc):
                 file_name = self.__file if isinstance(self.__file, str) \
                                 else self.__file.encode('utf-8')
                 msg = 'File %s does not appear to be correctly encoded.\n' % file_name
+            try:
+                os.remove(self.__temp_file)
+            except OSError:
+                pass
             raise InvalidRtfException, msg
         delete_info_obj = delete_info.DeleteInfo(
             in_file = self.__temp_file,
             copy = self.__copy,
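The net effect of the two ParseRtf hunks: on a token-processing failure, msg now has a default ('Exception in token processing') and is only refined when the encoding check flags a bad codepage, and the temp-file cleanup runs on every failure path just before the re-raise rather than only at the top of the handler. A sketch of the resulting pattern (the flag argument stands in for check_encoding_obj.check_encoding, and ValueError for InvalidRtfException):

    import os

    def fail_with_cleanup(temp_file, source_file, looks_misencoded):
        msg = 'Exception in token processing'   # default reason
        if looks_misencoded:                    # refined when detectable
            msg = 'File %s does not appear to be correctly encoded.\n' % source_file
        try:
            os.remove(temp_file)                # best-effort cleanup...
        except OSError:
            pass                                # ...never masks the real error
        raise ValueError(msg)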

View File

@@ -3,6 +3,7 @@
 # copyright 2002 Paul Henry Tremblay                                   #
 #                                                                      #
 #########################################################################
 '''
 Codepages as to RTF 1.9.1:
 437     United States IBM

View File

@@ -70,7 +70,7 @@ class ProcessTokens:
             ';'    : ('mc', ';', self.ms_sub_func),
             # this must be wrong
             '-'    : ('mc', '-', self.ms_sub_func),
-            'line' : ('mi', 'hardline-break', self.hardline_func), #calibre
+            'line' : ('mi', 'hardline-break', self.direct_conv_func), #calibre
             # misc => ml
             '*'    : ('ml', 'asterisk__', self.default_func),
             ':'    : ('ml', 'colon_____', self.default_func),
@@ -605,7 +605,7 @@ class ProcessTokens:
     def ms_sub_func(self, pre, token, num):
         return 'tx<mc<__________<%s\n' % token
-    def hardline_func(self, pre, token, num):
+    def direct_conv_func(self, pre, token, num):
         return 'mi<tg<empty_____<%s\n' % token
     def default_func(self, pre, token, num):
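ProcessTokens dispatches on a table mapping each RTF control word to a (category, canonical name, handler) triple; each handler formats one line of the intermediate token stream. The rename from hardline_func to direct_conv_func reflects that the handler is a generic "emit an empty tag" conversion rather than anything specific to line breaks. A trimmed-down illustration of the dispatch (the two entries are copied from the diff; the wrapper class is invented for the example):

    class MiniProcessTokens:
        def __init__(self):
            self.dispatch = {
                ';'    : ('mc', ';', self.ms_sub_func),
                'line' : ('mi', 'hardline-break', self.direct_conv_func),
            }
        def ms_sub_func(self, pre, token, num):
            return 'tx<mc<__________<%s\n' % token
        def direct_conv_func(self, pre, token, num):
            return 'mi<tg<empty_____<%s\n' % token
        def process(self, word):
            pre, token, func = self.dispatch[word]
            return func(pre, token, None)

    print(MiniProcessTokens().process('line'))   # mi<tg<empty_____<hardline-break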

View File

@@ -27,11 +27,13 @@ class Tokenize:
             bug_handler,
             copy = None,
             run_level = 1,
+            # out_file = None,
             ):
         self.__file = in_file
         self.__bug_handler = bug_handler
         self.__copy = copy
         self.__write_to = tempfile.mktemp()
+        # self.__out_file = out_file
         self.__compile_expressions()
         #variables
         self.__uc_char = 0
@@ -113,6 +115,8 @@ class Tokenize:
     def __sub_reg_split(self,input_file):
         input_file = self.__replace_spchar.mreplace(input_file)
+        # this is for older RTF
+        input_file = self.__par_exp.sub('\n\\par \n', input_file)
         input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
         input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
         #remove \n in bin data
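The padding in '\n\\par \n' matters because of what runs next: __splitexp cuts the stream on newlines and control words, so the substituted \par (with its trailing space) comes out as a token of its own. Roughly (splitexp copied from the hunk further down; the filtering is simplified for illustration):

    import re

    splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
    text = 'First paragraph\n\\par \nSecond paragraph'
    tokens = [t for t in splitexp.split(text) if t and t != '\n']
    print(tokens)   # ['First paragraph', '\\par ', 'Second paragraph']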
@@ -127,7 +131,7 @@ class Tokenize:
         # this is for older RTF
         #line = re.sub(self.__par_exp, '\\par ', line)
         #return filter(lambda x: len(x) > 0, \
             #(self.__remove_line.sub('', x) for x in tokens))

     def __compile_expressions(self):
         SIMPLE_RPL = {
@@ -153,8 +157,6 @@ class Tokenize:
             # put a backslash in front of to eliminate special cases and
             # make processing easier
             "}": "\\}",
-            # this is for older RTF
-            r'\\$': '\\par ',
             }
         self.__replace_spchar = MReplace(SIMPLE_RPL)
         #add ;? in case of char following \u
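Why the SIMPLE_RPL entry had to go: assuming MReplace performs literal multi-string replacement (every other key in the table, such as "}", is literal text), the key r'\\$' was the three characters backslash-backslash-dollar, not "a backslash at end of line"; the $ was never a regex anchor, so the rule could not fire on real input. Under that assumption:

    import re

    key = r'\\$'          # the deleted key: the three chars \ \ $
    sample = 'text\\\n'   # backslash at end of line, as in old RTF
    print(key in sample)                         # False -- a literal rule never matches this
    print(bool(re.search(r'\\\n+', sample)))     # True -- the new compiled regex does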
@@ -168,10 +170,12 @@ class Tokenize:
         #why keep backslash whereas \is replaced before?
         #remove \n from endline char
         self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
+        #this is for old RTF
+        self.__par_exp = re.compile(r'\\\n+')
+        # self.__par_exp = re.compile(r'\\$')
         #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
         #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
         #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
-        #self.__par_exp = re.compile(r'\\$')
         #self.__remove_line = re.compile(r'\n+')
         #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
         ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
@@ -199,7 +203,24 @@ class Tokenize:
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "tokenize.data")
+        # if self.__out_file:
+        #     self.__file = self.__out_file
         copy_obj.rename(self.__write_to, self.__file)
         os.remove(self.__write_to)

 #self.__special_tokens = [ '_', '~', "'", '{', '}' ]
+# import sys
+# def main(args=sys.argv):
+#     if len(args) < 1:
+#         print 'No file'
+#         return
+#     file = 'data_tokens.txt'
+#     if len(args) == 3:
+#         file = args[2]
+#     to = Tokenize(args[1], Exception, out_file = file)
+#     to.tokenize()
+# if __name__ == '__main__':
+#     sys.exit(main())