diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 5907bf6b55..a6b8c86e79 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -312,7 +312,6 @@ class RTFInput(InputFormatPlugin):
         try:
             xml = self.generate_xml(stream.name)
         except RtfInvalidCodeException, e:
-            raise
             raise ValueError(_('This RTF file has a feature calibre does not '
             'support. Convert it to HTML first and then try it.\n%s')%e)
 
diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py
index 902ad09c30..73f8f04e1c 100755
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@@ -226,10 +226,6 @@ class ParseRtf:
         try:
             return_value = process_tokens_obj.process_tokens()
         except InvalidRtfException, msg:
-            try:
-                os.remove(self.__temp_file)
-            except OSError:
-                pass
             #Check to see if the file is correctly encoded
             encode_obj = default_encoding.DefaultEncoding(
                 in_file = self.__temp_file,
@@ -244,11 +240,16 @@ class ParseRtf:
             enc = encode_obj.get_codepage()
             if enc != 'mac_roman':
                 enc = 'cp' + enc
+            msg = 'Exception in token processing'
             if check_encoding_obj.check_encoding(self.__file, enc):
                 file_name = self.__file if isinstance(self.__file, str) \
                     else self.__file.encode('utf-8')
                 msg = 'File %s does not appear to be correctly encoded.\n' % file_name
-            raise InvalidRtfException, msg
+            try:
+                os.remove(self.__temp_file)
+            except OSError:
+                pass
+            raise InvalidRtfException, msg
         delete_info_obj = delete_info.DeleteInfo(
             in_file = self.__temp_file,
             copy = self.__copy,
diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py
index aec33943a9..53887e0d90 100755
--- a/src/calibre/ebooks/rtf2xml/default_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/default_encoding.py
@@ -3,6 +3,7 @@
 # copyright 2002 Paul Henry Tremblay                                    #
 #                                                                       #
 #########################################################################
+
 '''
 Codepages as to RTF 1.9.1:
 437 United States IBM
diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py
index ff4fbe110c..5066843976 100755
--- a/src/calibre/ebooks/rtf2xml/process_tokens.py
+++ b/src/calibre/ebooks/rtf2xml/process_tokens.py
@@ -70,7 +70,7 @@ class ProcessTokens:
         ';' : ('mc', ';', self.ms_sub_func),
         # this must be wrong
         '-' : ('mc', '-', self.ms_sub_func),
-        'line' : ('mi', 'hardline-break', self.hardline_func), #calibre
+        'line' : ('mi', 'hardline-break', self.direct_conv_func), #calibre
         # misc => ml
         '*' : ('ml', 'asterisk__', self.default_func),
         ':' : ('ml', 'colon_____', self.default_func),
@@ -605,7 +605,7 @@ class ProcessTokens:
     def ms_sub_func(self, pre, token, num):
         return 'tx<mc<__________<%s\n' % token
diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py
--- a/src/calibre/ebooks/rtf2xml/tokenize.py
+++ b/src/calibre/ebooks/rtf2xml/tokenize.py
         input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
         input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
         #remove \n in bin data
@@ -127,7 +131,7 @@ class Tokenize:
         # this is for older RTF
         #line = re.sub(self.__par_exp, '\\par ', line)
         #return filter(lambda x: len(x) > 0, \
-            #(self.__remove_line.sub('', x) for x in tokens))
+        #(self.__remove_line.sub('', x) for x in tokens))
 
     def __compile_expressions(self):
         SIMPLE_RPL = {
@@ -153,8 +157,6 @@ class Tokenize:
             # put a backslash in front of to eliminate special cases and
             # make processing easier
             "}": "\\}",
-            # this is for older RTF
-            r'\\$': '\\par ',
         }
         self.__replace_spchar = MReplace(SIMPLE_RPL)
         #add ;? in case of char following \u
@@ -168,10 +170,12 @@ class Tokenize:
         #why keep backslash whereas \is replaced before?
         #remove \n from endline char
         self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
+        #this is for old RTF
+        self.__par_exp = re.compile(r'\\\n+')
+        # self.__par_exp = re.compile(r'\\$')
         #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
         #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
         #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
-        #self.__par_exp = re.compile(r'\\$')
         #self.__remove_line = re.compile(r'\n+')
         #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
         ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
@@ -199,7 +203,24 @@ class Tokenize:
 
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
            copy_obj.copy_file(self.__write_to, "tokenize.data")
+        # if self.__out_file:
+            # self.__file = self.__out_file
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
-        #self.__special_tokens = [ '_', '~', "'", '{', '}' ]
\ No newline at end of file
+        #self.__special_tokens = [ '_', '~', "'", '{', '}' ]
+
+# import sys
+# def main(args=sys.argv):
+    # if len(args) < 1:
+        # print 'No file'
+        # return
+    # file = 'data_tokens.txt'
+    # if len(args) == 3:
+        # file = args[2]
+    # to = Tokenize(args[1], Exception, out_file = file)
+    # to.tokenize()
+
+
+# if __name__ == '__main__':
+    # sys.exit(main())
\ No newline at end of file
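
The three sketches below are reviewer notes, not part of the patch; they illustrate the intent of the visible hunks using stand-in names.

First, the input.py hunk: a bare `raise` re-raises the active exception immediately, so the user-facing ValueError that followed it was dead code until the patch deleted the bare `raise`. A minimal sketch, with RuntimeError standing in for calibre's RtfInvalidCodeException:

def before_patch(e):
    raise e                                  # control leaves the function here...
    raise ValueError('This RTF file has a feature calibre does not '
                     'support. Convert it to HTML first and then try it.\n%s' % e)
                                             # ...so this friendlier error never surfaced

try:
    before_patch(RuntimeError('\\unsupported-control-word'))
except Exception as caught:
    print(type(caught).__name__)             # prints RuntimeError, never ValueError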
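
Second, the ParseRtf.py hunks: the temp file used to be deleted as soon as InvalidRtfException was caught, yet the encoding check that follows is built with in_file = self.__temp_file and so still needs that file on disk; the patch also seeds `msg` with a readable default instead of the raw exception object bound by the except clause. A sketch of the corrected flow, assuming a boolean-returning check_encoding stand-in and ValueError in place of InvalidRtfException:

import os

def report_invalid_rtf(temp_file, source_file, check_encoding):
    msg = 'Exception in token processing'      # default set before the check
    if check_encoding(source_file):            # temp/source files still on disk here
        msg = 'File %s does not appear to be correctly encoded.\n' % source_file
    try:
        os.remove(temp_file)                   # cleanup deferred to just before re-raise
    except OSError:
        pass
    raise ValueError(msg)

# report_invalid_rtf('rtf_temp_file', 'book.rtf', lambda f: False)
# -> ValueError: Exception in token processing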
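
Third, the tokenize.py hunks: the old-RTF rule r'\\$': '\\par ' sat in SIMPLE_RPL, a table that MReplace appears to apply as literal string replacements, so the regex anchor could never fire; the patch moves the rule into a compiled pattern, self.__par_exp = re.compile(r'\\\n+'), which also consumes the run of newlines after the trailing backslash. A sketch of the pattern's behaviour; the '\\par ' replacement string is an assumption, since the hunk that applies __par_exp is not visible in this excerpt:

import re

par_exp = re.compile(r'\\\n+')               # new pattern from the patch
old_rtf = 'line one\\\n\n\nline two'         # old-style RTF: backslash ends the line

# One substitution folds the backslash and the whole newline run into \par.
print(par_exp.sub('\\\\par ', old_rtf))      # -> line one\par line two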