diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 3a804792c5..76bdcc08af 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -133,7 +133,6 @@ class ParseRtf: self.__temp_dir = out_dir self.__dtd_path = dtd self.__check_file(in_file,"file_to_parse") - self.__check_ascii(in_file) self.__char_data = char_data self.__debug_dir = deb_dir self.__check_dir(self.__temp_dir) @@ -152,6 +151,7 @@ class ParseRtf: self.__group_borders = group_borders self.__empty_paragraphs = empty_paragraphs self.__no_dtd = no_dtd + def __check_file(self, the_file, type): """Check to see if files exist""" if hasattr(the_file, 'read'): return @@ -164,6 +164,7 @@ class ParseRtf: else: msg = "\nThe file '%s' cannot be found" % the_file raise RtfInvalidCodeException, msg + def __check_dir(self, the_dir): """Check to see if directory exists""" if not the_dir : @@ -173,15 +174,7 @@ class ParseRtf: msg = "\n%s is not a directory" % the_dir raise RtfInvalidCodeException, msg return 1 - def __check_ascii(self, the_file): - """Check to see if the file is correct ascii""" - try: - test = codecs.open(the_file, 'r', 'ascii', 'strict') - test.close() - except UnicodeError: - msg = "\n%s is not a correct ascii file" % the_file - raise RtfInvalidCodeException, msg - return 1 + def parse_rtf(self): """ Parse the file by calling on other classes. @@ -192,6 +185,18 @@ class ParseRtf: depending on the value of 'output' when the instance was created. """ self.__temp_file = self.__make_temp_file(self.__file) + #Check to see if the file is correct ascii first + check_encoding_obj = check_encoding.CheckEncoding( + bug_handler = RtfInvalidCodeException, + ) + if check_encoding_obj.check_encoding(self.__file): + try: + os.remove(self.__temp_file) + except OSError: + pass + sys.stderr.write('File "%s" does not appear to be ascii.\n' \ + % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8')) + raise InvalidRtfException # if the self.__deb_dir is true, then create a copy object, # set the directory to write to, remove files, and copy # the new temporary file to this directory @@ -214,7 +219,7 @@ class ParseRtf: in_file = self.__temp_file, bug_handler = RtfInvalidCodeException, copy = self.__copy, - #run_level = self.__run_level, + run_level = self.__run_level, replace_illegals = self.__replace_illegals, ) line_obj.fix_endings() @@ -223,8 +228,8 @@ class ParseRtf: tokenize_obj = tokenize.Tokenize( bug_handler = RtfInvalidCodeException, in_file = self.__temp_file, - copy = self.__copy,) - #run_level = self.__run_level,) + copy = self.__copy, + run_level = self.__run_level) tokenize_obj.tokenize() process_tokens_obj = process_tokens.ProcessTokens( in_file = self.__temp_file, @@ -240,10 +245,6 @@ class ParseRtf: os.remove(self.__temp_file) except OSError: pass - check_encoding_obj = check_encoding.CheckEncoding( - bug_handler = RtfInvalidCodeException, - ) - check_encoding_obj.check_encoding(self.__file) sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8')) raise InvalidRtfException, msg delete_info_obj = delete_info.DeleteInfo( @@ -548,8 +549,7 @@ class ParseRtf: """Make a temporary file to parse""" write_file="rtf_write_file" read_obj = file if hasattr(file, 'read') else open(file,'r') - write_obj = open(write_file, 'w') - for line in read_obj: - write_obj.write(line) + write_obj = open(write_file, 'wb') + write_obj.write(read_obj.read()) write_obj.close() return write_file \ No newline at end of file diff --git a/src/calibre/ebooks/rtf2xml/check_brackets.py b/src/calibre/ebooks/rtf2xml/check_brackets.py index 53f9363d63..8917780746 100755 --- a/src/calibre/ebooks/rtf2xml/check_brackets.py +++ b/src/calibre/ebooks/rtf2xml/check_brackets.py @@ -30,7 +30,6 @@ class CheckBrackets: self.__bracket_count += 1 def close_brack(self, line): num = line[-5:-1] - ##self.__open_bracket_num.append(num) try: last_num = self.__open_bracket_num.pop() except: diff --git a/src/calibre/ebooks/rtf2xml/check_encoding.py b/src/calibre/ebooks/rtf2xml/check_encoding.py index f6810e4909..1f8645bb0c 100755 --- a/src/calibre/ebooks/rtf2xml/check_encoding.py +++ b/src/calibre/ebooks/rtf2xml/check_encoding.py @@ -14,12 +14,11 @@ class CheckEncoding: sys.stderr.write(str(msg) + '\n') def check_encoding(self, path, encoding='us-ascii'): read_obj = open(path, 'r') - line_to_read = 1 + input_file = read_obj.read() + read_obj.close() line_num = 0 - while line_to_read: + for line in input_file: line_num += 1 - line_to_read = read_obj.readline() - line = line_to_read try: line.decode(encoding) except UnicodeError: @@ -27,6 +26,9 @@ class CheckEncoding: self.__get_position_error(line, encoding, line_num) else: sys.stderr.write('line: %d has bad encoding\n'%line_num) + return True + return False + if __name__ == '__main__': check_encoding_obj = CheckEncoding() check_encoding_obj.check_encoding(sys.argv[1]) diff --git a/src/calibre/ebooks/rtf2xml/line_endings.py b/src/calibre/ebooks/rtf2xml/line_endings.py index e77e5d747c..86546967a7 100755 --- a/src/calibre/ebooks/rtf2xml/line_endings.py +++ b/src/calibre/ebooks/rtf2xml/line_endings.py @@ -23,7 +23,7 @@ class FixLineEndings: bug_handler, in_file = None, copy = None, - #run_level = 1, calibre why keep it? + run_level = 1, replace_illegals = 1, ): self.__file = in_file @@ -32,8 +32,11 @@ class FixLineEndings: self.__write_to = tempfile.mktemp() self.__replace_illegals = replace_illegals def fix_endings(self): - illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13') - # always check since I have to get rid of illegal characters + #remove ASCII invalid chars : 0 to 8 and 11-14 to 24 + #always check since I have to get rid of illegal characters + chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) + illegal_regx = re.compile(u'|'.join(map(unichr, chars))) + #illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13') #read read_obj = open(self.__file, 'r') input_file = read_obj.read() @@ -42,7 +45,7 @@ class FixLineEndings: input_file = input_file.replace ('\r\n', '\n') input_file = input_file.replace ('\r', '\n') if self.__replace_illegals: - input_file = re.sub(illegal_regx, '', input_file) + input_file = illegal_regx.sub('', input_file) #write write_obj = open(self.__write_to, 'wb') write_obj.write(input_file)