From a9fd0ad4ba9acdcc07d5bfcae503c378c25a7303 Mon Sep 17 00:00:00 2001 From: Sengian Date: Mon, 16 Aug 2010 10:08:59 +0200 Subject: [PATCH] Global overhaul of rtf2xml: RTFfixes (5) ->minors corrections and regression correction --- src/calibre/ebooks/rtf/input.py | 2 +- src/calibre/ebooks/rtf2xml/ParseRtf.py | 14 +- src/calibre/ebooks/rtf2xml/check_encoding.py | 11 +- src/calibre/ebooks/rtf2xml/copy.py | 14 +- src/calibre/ebooks/rtf2xml/process_tokens.py | 163 ++++++++++--------- 5 files changed, 104 insertions(+), 100 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 824da7d6f1..f4fbdf411c 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -50,7 +50,7 @@ class RTFInput(InputFormatPlugin): parser = ParseRtf( in_file = stream, out_file = ofile, - deb_dir = 'I:\\Calibre\\rtfdebug', + deb_dir = 'D:\\calibre\\pierre\\debug\\rtfdebug', # Convert symbol fonts to unicode equivalents. Default # is 1 convert_symbol = 1, diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 76bdcc08af..1230ae150e 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -120,8 +120,6 @@ class ParseRtf: script tries to output to directory where is script is exectued.) 'deb_dir' --debug directory. If a debug_dir is provided, the script will copy each run through as a file to examine in the debug_dir - 'perl_script'--use perl to make tokens. This runs just a bit faster. - (I will probably phase this out.) 'check_brackets' -- make sure the brackets match up after each run through a file. Only for debugging. Returns: Nothing @@ -142,7 +140,7 @@ class ParseRtf: self.__convert_wingdings = convert_wingdings self.__convert_zapf = convert_zapf self.__run_level = run_level - #self.__exit_level = 0 + #self.__exit_level = 0 See what this means and if it is consistent self.__indent = indent self.__replace_illegals = replace_illegals self.__form_lists = form_lists @@ -184,19 +182,15 @@ class ParseRtf: A parsed file in XML, either to standard output or to a file, depending on the value of 'output' when the instance was created. """ - self.__temp_file = self.__make_temp_file(self.__file) #Check to see if the file is correct ascii first check_encoding_obj = check_encoding.CheckEncoding( bug_handler = RtfInvalidCodeException, ) if check_encoding_obj.check_encoding(self.__file): - try: - os.remove(self.__temp_file) - except OSError: - pass sys.stderr.write('File "%s" does not appear to be ascii.\n' \ % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8')) raise InvalidRtfException + self.__temp_file = self.__make_temp_file(self.__file) # if the self.__deb_dir is true, then create a copy object, # set the directory to write to, remove files, and copy # the new temporary file to this directory @@ -223,7 +217,6 @@ class ParseRtf: replace_illegals = self.__replace_illegals, ) line_obj.fix_endings() - #return_value = line_obj.fix_endings() #calibre: no return in this function, why keep it? #self.__return_code(return_value) tokenize_obj = tokenize.Tokenize( bug_handler = RtfInvalidCodeException, @@ -550,6 +543,7 @@ class ParseRtf: write_file="rtf_write_file" read_obj = file if hasattr(file, 'read') else open(file,'r') write_obj = open(write_file, 'wb') - write_obj.write(read_obj.read()) + for line in read_obj: + write_obj.write(line) write_obj.close() return write_file \ No newline at end of file diff --git a/src/calibre/ebooks/rtf2xml/check_encoding.py b/src/calibre/ebooks/rtf2xml/check_encoding.py index 1f8645bb0c..444fd373e4 100755 --- a/src/calibre/ebooks/rtf2xml/check_encoding.py +++ b/src/calibre/ebooks/rtf2xml/check_encoding.py @@ -14,10 +14,10 @@ class CheckEncoding: sys.stderr.write(str(msg) + '\n') def check_encoding(self, path, encoding='us-ascii'): read_obj = open(path, 'r') - input_file = read_obj.read() - read_obj.close() + line_num = 0 - for line in input_file: + error_found = False + for line in read_obj: line_num += 1 try: line.decode(encoding) @@ -26,8 +26,9 @@ class CheckEncoding: self.__get_position_error(line, encoding, line_num) else: sys.stderr.write('line: %d has bad encoding\n'%line_num) - return True - return False + error_found = True + read_obj.close() + return error_found if __name__ == '__main__': check_encoding_obj = CheckEncoding() diff --git a/src/calibre/ebooks/rtf2xml/copy.py b/src/calibre/ebooks/rtf2xml/copy.py index ff029c1841..1b620b9fbf 100755 --- a/src/calibre/ebooks/rtf2xml/copy.py +++ b/src/calibre/ebooks/rtf2xml/copy.py @@ -23,6 +23,7 @@ class Copy: def __init__(self, bug_handler, file = None, deb_dir = None, ): self.__file = file self.__bug_handler = bug_handler + def set_dir(self, deb_dir): """Set the temporary directory to write files to""" if deb_dir is None: @@ -33,19 +34,11 @@ class Copy: message = "%(deb_dir)s is not a directory" % vars() raise self.__bug_handler , message Copy.__dir = deb_dir + def remove_files(self ): """Remove files from directory""" self.__remove_the_files(Copy.__dir) - """ - list_of_files = os.listdir(Copy.__dir) - list_of_files = os.listdir(the_dir) - for file in list_of_files: - rem_file = os.path.join(Copy.__dir,file) - if os.path.isdir(rem_file): - self.remove_files(rem_file) - else: - os.remove(rem_file) - """ + def __remove_the_files(self, the_dir): """Remove files from directory""" list_of_files = os.listdir(the_dir) @@ -58,6 +51,7 @@ class Copy: os.remove(rem_file) except OSError: pass + def copy_file(self, file, new_file): """ Copy the file to a new name diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 072d8b02e4..2c5c0c7df0 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -735,8 +735,94 @@ class ProcessTokens: pre, token, action = self.dict_token.get(token, (None, None, None)) if action: return action(pre, token, num) - # unused function - def initiate_token_actions(self): + + def __check_brackets(self, in_file): + self.__check_brack_obj = check_brackets.CheckBrackets\ + (file = in_file) + good_br = self.__check_brack_obj.check_brackets()[0] + if not good_br: + return 1 + def process_tokens(self): + """Main method for handling other methods. """ + + read_obj= open(self.__file, 'r') + write_obj = open(self.__write_to, 'wb') + + '''first_token = 0 + second_token = 0''' + line_count = 0 + + for line in read_obj: + token = line.replace("\n","") + #calibre not necessary normaly, fixed in tokenize + '''if not token: + continue''' + line_count += 1 + #calibre not necessary, encoding checked before + """try: + token.decode('us-ascii') + except UnicodeError, msg: + msg = str(msg) + msg += 'Invalid RTF: File not ascii encoded.\n' + raise self.__exception_handler, msg""" + #calibre: with tokenize, should be first and second line, why bother? + """if not first_token: + if token != '\\{': + msg = 'Invalid RTF: document doesn\'t start with {\n' + raise self.__exception_handler, msg + first_token = 1 + elif line_count == and not second_token: + if token[0:4] != '\\rtf': + msg ='Invalid RTF: document doesn\'t start with \\rtf \n' + raise self.__exception_handler, msg + second_token = 1""" + if line_count == 1 and token != '\\{': + msg = 'Invalid RTF: document doesn\'t start with {\n' + raise self.__exception_handler, msg + elif line_count == 2 and token[0:4] != '\\rtf': + msg ='Invalid RTF: document doesn\'t start with \\rtf \n' + raise self.__exception_handler, msg + + ##token = self.evaluate_token(token) + the_index = token.find('\\ ') + if token is not None and the_index > -1: + msg ='Invalid RTF: token "\\ " not valid.\n' + raise self.__exception_handler, msg + elif token[:1] == "\\": + line = self.process_cw(token) + if line is not None: + write_obj.write(line) + else: + fields = re.split(self.__utf_exp, token) + for field in fields: + if not field: + continue + if field[0:1] == '&': + write_obj.write('tx -1: - msg ='Invalid RTF: token "\\ " not valid. \n' - raise self.__exception_handler, msg - elif token[0:1] == "\\": - line = self.process_cw(token) - if line != None: - write_obj.write(line) - else: - fields = re.split(self.__utf_exp, token) - for field in fields: - if not field: - continue - if field[0:1] == '&': - write_obj.write('tx