From ac07ff853ead790c664051cdb8628a1b1fb30f53 Mon Sep 17 00:00:00 2001 From: Sengian Date: Fri, 7 Jan 2011 08:07:39 +0100 Subject: [PATCH] Handle non ascii charset in RTF if declared as codepage --- src/calibre/ebooks/rtf2xml/ParseRtf.py | 2 +- src/calibre/ebooks/rtf2xml/check_encoding.py | 1 + src/calibre/ebooks/rtf2xml/convert_to_tags.py | 50 ++++++++++++++----- .../ebooks/rtf2xml/default_encoding.py | 3 +- 4 files changed, 41 insertions(+), 15 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 901188a000..f9036989b0 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -326,7 +326,6 @@ class ParseRtf: invalid_rtf_handler = InvalidRtfException, ) hex2utf_obj.convert_hex_2_utf8() - # raise RtfInvalidCodeException, 'stop' self.__bracket_match('hex_2_utf_preamble') fonts_obj = fonts.Fonts( in_file = self.__temp_file, @@ -523,6 +522,7 @@ class ParseRtf: indent = self.__indent, run_level = self.__run_level, no_dtd = self.__no_dtd, + encoding = encode_obj.get_codepage(), bug_handler = RtfInvalidCodeException, ) tags_obj.convert_to_tags() diff --git a/src/calibre/ebooks/rtf2xml/check_encoding.py b/src/calibre/ebooks/rtf2xml/check_encoding.py index 4503cbf98a..ae512fa68a 100755 --- a/src/calibre/ebooks/rtf2xml/check_encoding.py +++ b/src/calibre/ebooks/rtf2xml/check_encoding.py @@ -1,5 +1,6 @@ #!/usr/bin/env python import sys + class CheckEncoding: def __init__(self, bug_handler): diff --git a/src/calibre/ebooks/rtf2xml/convert_to_tags.py b/src/calibre/ebooks/rtf2xml/convert_to_tags.py index c2244b784a..6563d2e982 100755 --- a/src/calibre/ebooks/rtf2xml/convert_to_tags.py +++ b/src/calibre/ebooks/rtf2xml/convert_to_tags.py @@ -1,6 +1,9 @@ import os, tempfile -from calibre.ebooks.rtf2xml import copy + +from calibre.ebooks.rtf2xml import copy, check_encoding + public_dtd = 'rtf2xml1.0.dtd' + class ConvertToTags: """ Convert file to XML @@ -10,6 +13,7 @@ class ConvertToTags: bug_handler, dtd_path, no_dtd, + encoding, indent = None, copy = None, run_level = 1, @@ -29,9 +33,14 @@ class ConvertToTags: self.__copy = copy self.__dtd_path = dtd_path self.__no_dtd = no_dtd + if encoding != 'mac_roman': + self.__encoding = 'cp' + encoding + else: + self.__encoding = 'mac_roman' self.__indent = indent self.__run_level = run_level self.__write_to = tempfile.mktemp() + def __initiate_values(self): """ Set values, including those for the dictionary. @@ -61,6 +70,7 @@ class ConvertToTags: 'tx' % info) + def __empty_func(self, line): """ Print out empty tag and newlines when needed. @@ -85,6 +96,7 @@ class ConvertToTags: self.__write_new_line() if info in self.__two_new_line: self.__write_extra_new_line() + def __open_att_func(self, line): """ Process lines for open tags that have attributes. @@ -119,6 +131,7 @@ class ConvertToTags: self.__write_new_line() if element_name in self.__two_new_line: self.__write_extra_new_line() + def __empty_att_func(self, line): """ Same as the __open_att_func, except a '/' is placed at the end of the tag. @@ -143,6 +156,7 @@ class ConvertToTags: self.__write_new_line() if element_name in self.__two_new_line: self.__write_extra_new_line() + def __close_func(self, line): """ Print out the closed tag and new lines, if appropriate. @@ -156,6 +170,7 @@ class ConvertToTags: self.__write_new_line() if info in self.__two_new_line: self.__write_extra_new_line() + def __text_func(self, line): """ Simply print out the information between [17:-1] @@ -163,6 +178,7 @@ class ConvertToTags: #tx') + #keep maximum compatibility with previous version + check_encoding_obj = check_encoding.CheckEncoding( + bug_handler = self.__bug_handler, + ) + if not check_encoding_obj.check_encoding(self.__file): + self.__write_obj.write('') + elif not check_encoding_obj.check_encoding(self.__file, self.__encoding): + self.__write_obj.write('' % self.__encoding) + else: + self.__write_obj.write('') + sys.stderr.write(_('Bad RTF encoding, revert to US-ASCII chars and hope for the best')) self.__new_line = 0 self.__write_new_line() if self.__no_dtd: @@ -207,6 +236,7 @@ class ConvertToTags: ) self.__new_line = 0 self.__write_new_line() + def convert_to_tags(self): """ Read in the file one line at a time. Get the important info, between @@ -222,18 +252,14 @@ class ConvertToTags: an empty tag function. """ self.__initiate_values() - read_obj = open(self.__file, 'r') self.__write_obj = open(self.__write_to, 'w') self.__write_dec() - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - action = self.__state_dict.get(self.__token_info) - if action != None: - action(line) - read_obj.close() + with open(self.__file, 'r') as read_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__state_dict.get(self.__token_info) + if action is not None: + action(line) self.__write_obj.close() copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index a4eeac9663..e145a8a75e 100755 --- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -132,8 +132,7 @@ class DefaultEncoding: self.__code_page = '850' # if __name__ == '__main__': - # from calibre.ebooks.rtf2xml import default_encoding - # encode_obj = default_encoding.DefaultEncoding( + # encode_obj = DefaultEncoding( # in_file = sys.argv[1], # bug_handler = Exception, # check_raw = True,