diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 92ac8a2519..d1a6b7c88a 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -286,7 +286,6 @@ class RTFInput(InputFormatPlugin): try: xml = self.generate_xml(stream.name) except RtfInvalidCodeException, e: - raise raise ValueError(_('This RTF file has a feature calibre does not ' 'support. Convert it to HTML first and then try it.\n%s')%e) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index cdd9a3d088..d673836210 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -226,10 +226,6 @@ class ParseRtf: try: return_value = process_tokens_obj.process_tokens() except InvalidRtfException, msg: - try: - os.remove(self.__temp_file) - except OSError: - pass #Check to see if the file is correctly encoded encode_obj = default_encoding.DefaultEncoding( in_file = self.__temp_file, @@ -241,14 +237,17 @@ class ParseRtf: check_encoding_obj = check_encoding.CheckEncoding( bug_handler = RtfInvalidCodeException, ) - enc = encode_obj.get_codepage() - if enc != 'mac_roman': - enc = 'cp' + enc + enc = 'cp' + encode_obj.get_codepage() + msg = 'Exception in token processing' if check_encoding_obj.check_encoding(self.__file, enc): file_name = self.__file if isinstance(self.__file, str) \ else self.__file.encode('utf-8') msg = 'File %s does not appear to be correctly encoded.\n' % file_name - raise InvalidRtfException, msg + try: + os.remove(self.__temp_file) + except OSError: + pass + raise InvalidRtfException, msg delete_info_obj = delete_info.DeleteInfo( in_file = self.__temp_file, copy = self.__copy, diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index 53887e0d90..3ddfbcd321 100755 --- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -74,9 +74,6 @@ class DefaultEncoding: if not self.__datafetched: self._encoding() self.__datafetched = True - if self.__platform == 'Macintosh': - code_page = self.__code_page - else: code_page = 'ansicpg' + self.__code_page return self.__platform, code_page, self.__default_num @@ -94,49 +91,60 @@ class DefaultEncoding: def _encoding(self): with open(self.__file, 'r') as read_obj: + cpfound = False if not self.__fetchraw: for line in read_obj: self.__token_info = line[:16] if self.__token_info == 'mi 3: - msg = 'flag problem\n' + msg = 'Flag problem\n' raise self.__bug_handler, msg return True elif self.__token_info in self.__allowable : @@ -173,8 +171,8 @@ class DeleteInfo: Return True for all control words. Return False otherwise. """ - if self.__delete_count == self.__cb_count and self.__token_info ==\ - 'cb33\n + def __collect_tokens_func(self, line): """ Requires: @@ -194,18 +227,19 @@ class Info: att = line[6:16] value = line[20:-1] att_changed = self.__token_dict.get(att) - if att_changed == None: + if att_changed is None: if self.__run_level > 3: - msg = 'no dictionary match for %s\n' % att + msg = 'No dictionary match for %s\n' % att raise self.__bug_handler, msg else: self.__text_string += '<%s>%s' % (att_changed, value) + def __single_field_func(self, line, tag): value = line[20:-1] self.__write_obj.write( - 'mi%s\n' % (tag, tag, value) + 'mi%s\n' % (tag, tag, value) ) + def __after_info_table_func(self, line): """ Requires: @@ -217,6 +251,7 @@ class Info: the file. """ self.__write_obj.write(line) + def fix_info(self): """ Requires: @@ -234,20 +269,15 @@ class Info: information table, simply write the line to the output file. """ self.__initiate_values() - read_obj = open(self.__file, 'r') - self.__write_obj = open(self.__write_to, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - action = self.__state_dict.get(self.__state) - if action == None: - sys.stderr.write('no no matching state in module styles.py\n') - sys.stderr.write(self.__state + '\n') - action(line) - read_obj.close() - self.__write_obj.close() + with open(self.__file, 'r') as read_obj: + with open(self.__write_to, 'wb') as self.__write_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__state_dict.get(self.__state) + if action is None: + sys.stderr.write('No matching state in module styles.py\n') + sys.stderr.write(self.__state + '\n') + action(line) copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "info.data") diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 9460af07fc..c6cf124425 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -70,7 +70,7 @@ class ProcessTokens: ';' : ('mc', ';', self.ms_sub_func), # this must be wrong '-' : ('mc', '-', self.ms_sub_func), - 'line' : ('mi', 'hardline-break', self.hardline_func), #calibre + 'line' : ('mi', 'hardline-break', self.direct_conv_func), #calibre # misc => ml '*' : ('ml', 'asterisk__', self.default_func), ':' : ('ml', 'colon_____', self.default_func), @@ -78,7 +78,6 @@ class ProcessTokens: 'backslash' : ('nu', '\\', self.text_func), 'ob' : ('nu', '{', self.text_func), 'cb' : ('nu', '}', self.text_func), - #'line' : ('nu', ' ', self.text_func), calibre # paragraph formatting => pf 'page' : ('pf', 'page-break', self.default_func), 'par' : ('pf', 'par-end___', self.default_func), @@ -231,11 +230,15 @@ class ProcessTokens: 'trhdr' : ('tb', 'row-header', self.default_func), # preamble => pr # document information => di + # TODO integrate \userprops 'info' : ('di', 'doc-info__', self.default_func), + 'title' : ('di', 'title_____', self.default_func), 'author' : ('di', 'author____', self.default_func), 'operator' : ('di', 'operator__', self.default_func), - 'title' : ('di', 'title_____', self.default_func), + 'manager' : ('di', 'manager___', self.default_func), + 'company' : ('di', 'company___', self.default_func), 'keywords' : ('di', 'keywords__', self.default_func), + 'category' : ('di', 'category__', self.default_func), 'doccomm' : ('di', 'doc-notes_', self.default_func), 'comment' : ('di', 'doc-notes_', self.default_func), 'subject' : ('di', 'subject___', self.default_func), @@ -244,11 +247,19 @@ class ProcessTokens: 'mo' : ('di', 'month_____', self.default_func), 'dy' : ('di', 'day_______', self.default_func), 'min' : ('di', 'minute____', self.default_func), + 'sec' : ('di', 'second____', self.default_func), 'revtim' : ('di', 'revis-time', self.default_func), + 'edmins' : ('di', 'edit-time_', self.default_func), + 'printim' : ('di', 'print-time', self.default_func), + 'buptim' : ('di', 'backuptime', self.default_func), 'nofwords' : ('di', 'num-of-wor', self.default_func), 'nofchars' : ('di', 'num-of-chr', self.default_func), + 'nofcharsws' : ('di', 'numofchrws', self.default_func), 'nofpages' : ('di', 'num-of-pag', self.default_func), - 'edmins' : ('di', 'edit-time_', self.default_func), + 'version' : ('di', 'version___', self.default_func), + 'vern' : ('di', 'intern-ver', self.default_func), + 'hlinkbase' : ('di', 'linkbase__', self.default_func), + 'id' : ('di', 'internalID', self.default_func), # headers and footers => hf 'headerf' : ('hf', 'head-first', self.default_func), 'headerl' : ('hf', 'head-left_', self.default_func), @@ -605,7 +616,7 @@ class ProcessTokens: def ms_sub_func(self, pre, token, num): return 'tx ", input_file) input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file) #remove \n in bin data @@ -127,7 +131,7 @@ class Tokenize: # this is for older RTF #line = re.sub(self.__par_exp, '\\par ', line) #return filter(lambda x: len(x) > 0, \ - #(self.__remove_line.sub('', x) for x in tokens)) + #(self.__remove_line.sub('', x) for x in tokens)) def __compile_expressions(self): SIMPLE_RPL = { @@ -153,8 +157,6 @@ class Tokenize: # put a backslash in front of to eliminate special cases and # make processing easier "}": "\\}", - # this is for older RTF - r'\\$': '\\par ', } self.__replace_spchar = MReplace(SIMPLE_RPL) #add ;? in case of char following \u @@ -168,10 +170,12 @@ class Tokenize: #why keep backslash whereas \is replaced before? #remove \n from endline char self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)") + #this is for old RTF + self.__par_exp = re.compile(r'\\\n+') + # self.__par_exp = re.compile(r'\\$') #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}") #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})") #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)") - #self.__par_exp = re.compile(r'\\$') #self.__remove_line = re.compile(r'\n+') #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)") ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)") @@ -199,7 +203,24 @@ class Tokenize: copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "tokenize.data") + # if self.__out_file: + # self.__file = self.__out_file copy_obj.rename(self.__write_to, self.__file) os.remove(self.__write_to) - #self.__special_tokens = [ '_', '~', "'", '{', '}' ] \ No newline at end of file + #self.__special_tokens = [ '_', '~', "'", '{', '}' ] + +# import sys +# def main(args=sys.argv): + # if len(args) < 1: + # print 'No file' + # return + # file = 'data_tokens.txt' + # if len(args) == 3: + # file = args[2] + # to = Tokenize(args[1], Exception, out_file = file) + # to.tokenize() + + +# if __name__ == '__main__': + # sys.exit(main()) \ No newline at end of file