From ed4da14df07a4c61a21bfe09c542aa4802863a9d Mon Sep 17 00:00:00 2001 From: Sengian Date: Mon, 31 Jan 2011 08:29:42 +0100 Subject: [PATCH] Correct problems with tag splitting in RTFParser, some encoding refactoring & move all encodings to UTF-8 or US-ASCII for lxml --- src/calibre/ebooks/rtf/input.py | 23 ++------ src/calibre/ebooks/rtf2xml/ParseRtf.py | 2 + src/calibre/ebooks/rtf2xml/colors.py | 54 +++++++++++-------- src/calibre/ebooks/rtf2xml/convert_to_tags.py | 38 ++++++++----- .../ebooks/rtf2xml/default_encoding.py | 4 ++ src/calibre/ebooks/rtf2xml/fonts.py | 36 +++++++------ src/calibre/ebooks/rtf2xml/get_char_map.py | 2 +- src/calibre/ebooks/rtf2xml/tokenize.py | 24 +++++---- 8 files changed, 101 insertions(+), 82 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 6361cb7fdb..caa35a9eda 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -85,6 +85,7 @@ class RTFInput(InputFormatPlugin): debug_dir = 'rtfdebug' run_lev = 4 indent_out = 1 + self.log('Running RTFParser in debug mode') except: pass parser = ParseRtf( @@ -233,22 +234,6 @@ class RTFInput(InputFormatPlugin): with open('styles.css', 'ab') as f: f.write(css) - # def preprocess(self, fname): - # self.log('\tPreprocessing to convert unicode characters') - # try: - # data = open(fname, 'rb').read() - # from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser - # tokenizer = RtfTokenizer(data) - # tokens = RtfTokenParser(tokenizer.tokens) - # data = tokens.toRTF() - # fname = 'preprocessed.rtf' - # with open(fname, 'wb') as f: - # f.write(data) - # except: - # self.log.exception( - # 'Failed to preprocess RTF to convert unicode sequences, ignoring...') - # return fname - def convert_borders(self, doc): border_styles = [] style_map = {} @@ -283,8 +268,6 @@ class RTFInput(InputFormatPlugin): self.opts = options self.log = log self.log('Converting RTF to XML...') - #Name of the preprocesssed RTF file - # fname = self.preprocess(stream.name) try: xml = self.generate_xml(stream.name) except RtfInvalidCodeException, e: @@ -338,4 +321,6 @@ class RTFInput(InputFormatPlugin): opf.render(open('metadata.opf', 'wb')) return os.path.abspath('metadata.opf') -#ebook-convert "bad.rtf" test.epub -v -d "D:\Mes eBooks\Developpement\debug" \ No newline at end of file +#ebook-convert "bad.rtf" test.epub -v -d "D:\Mes eBooks\Developpement\debug" +# os.makedirs('D:\\Mes eBooks\\Developpement\\rtfdebug') +# debug_dir = 'D:\\Mes eBooks\\Developpement\\rtfdebug' \ No newline at end of file diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index a28b6f81da..56e18fe74d 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -238,6 +238,8 @@ class ParseRtf: bug_handler = RtfInvalidCodeException, ) enc = 'cp' + encode_obj.get_codepage() + if enc == 'cp10000': + enc = 'mac_roman' msg = 'Exception in token processing' if check_encoding_obj.check_encoding(self.__file, enc): file_name = self.__file if isinstance(self.__file, str) \ diff --git a/src/calibre/ebooks/rtf2xml/colors.py b/src/calibre/ebooks/rtf2xml/colors.py index d81b293bbf..eba03547c8 100755 --- a/src/calibre/ebooks/rtf2xml/colors.py +++ b/src/calibre/ebooks/rtf2xml/colors.py @@ -15,8 +15,10 @@ # # # # ######################################################################### -import sys, os, tempfile, re +import sys, os, tempfile, re + from calibre.ebooks.rtf2xml import copy + class Colors: """ Change lines with color info from color numbers to the actual color names. @@ -40,8 +42,10 @@ class Colors: self.__file = in_file self.__copy = copy self.__bug_handler = bug_handler + self.__line = 0 self.__write_to = tempfile.mktemp() self.__run_level = run_level + def __initiate_values(self): """ Initiate all values. @@ -61,6 +65,7 @@ class Colors: self.__color_num = 1 self.__line_color_exp = re.compile(r'bdr-color_:(\d+)') # cw 3: - msg = 'no value in self.__color_dict for key %s\n' % num - raise self.__bug_hanlder, msg - if hex_num == None: + if hex_num is None: hex_num = '0' + if self.__run_level > 5: + msg = 'no value in self.__color_dict' \ + 'for key %s at line %d\n' % (num, self.__line) + raise self.__bug_handler, msg return hex_num + def __do_nothing_func(self, line): """ Bad RTF will have text in the color table """ pass + def convert_colors(self): """ Requires: @@ -226,20 +238,16 @@ class Colors: info, and substitute the number with the hex number. """ self.__initiate_values() - read_obj = open(self.__file, 'r') - self.__write_obj = open(self.__write_to, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - action = self.__state_dict.get(self.__state) - if action == None: - sys.stderr.write('no no matching state in module fonts.py\n') - sys.stderr.write(self.__state + '\n') - action(line) - read_obj.close() - self.__write_obj.close() + with open(self.__file, 'r') as read_obj: + with open(self.__write_to, 'w') as self.__write_obj: + for line in read_obj: + self.__line+=1 + self.__token_info = line[:16] + action = self.__state_dict.get(self.__state) + if action is None: + sys.stderr.write('no matching state in module fonts.py\n') + sys.stderr.write(self.__state + '\n') + action(line) copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "color.data") diff --git a/src/calibre/ebooks/rtf2xml/convert_to_tags.py b/src/calibre/ebooks/rtf2xml/convert_to_tags.py index 6927537474..1abc672f85 100755 --- a/src/calibre/ebooks/rtf2xml/convert_to_tags.py +++ b/src/calibre/ebooks/rtf2xml/convert_to_tags.py @@ -33,13 +33,13 @@ class ConvertToTags: self.__copy = copy self.__dtd_path = dtd_path self.__no_dtd = no_dtd - if encoding != 'mac_roman': - self.__encoding = 'cp' + encoding - else: + self.__encoding = 'cp' + encoding + if encoding == 'mac_roman': self.__encoding = 'mac_roman' self.__indent = indent self.__run_level = run_level self.__write_to = tempfile.mktemp() + self.__convert_utf = False def __initiate_values(self): """ @@ -213,7 +213,8 @@ class ConvertToTags: if not check_encoding_obj.check_encoding(self.__file, verbose=False): self.__write_obj.write('') elif not check_encoding_obj.check_encoding(self.__file, self.__encoding): - self.__write_obj.write('' % self.__encoding) + self.__write_obj.write('') + self.__convert_utf = True else: self.__write_obj.write('') sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and' @@ -253,15 +254,28 @@ class ConvertToTags: an empty tag function. """ self.__initiate_values() - self.__write_obj = open(self.__write_to, 'w') - self.__write_dec() - with open(self.__file, 'r') as read_obj: - for line in read_obj: - self.__token_info = line[:16] - action = self.__state_dict.get(self.__token_info) - if action is not None: - action(line) + with open(self.__write_to, 'w') as self.__write_obj: + self.__write_dec() + with open(self.__file, 'r') as read_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__state_dict.get(self.__token_info) + if action is not None: + action(line) self.__write_obj.close() + #convert all encodings to UTF8 to avoid unsupported encodings in lxml + if self.__convert_utf: + copy_obj = copy.Copy(bug_handler = self.__bug_handler) + copy_obj.rename(self.__write_to, self.__file) + with open(self.__file, 'r') as read_obj: + with open(self.__write_to, 'w') as write_obj: + file = read_obj.read() + try: + file = file.decode(self.__encoding) + write_obj.write(file.encode('utf-8')) + except: + sys.stderr.write('Conversion to UTF-8 is not possible,' + ' encoding should be very carefully checked') copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "convert_to_tags.data") diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index 3ddfbcd321..c0a43db800 100755 --- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -75,12 +75,16 @@ class DefaultEncoding: self._encoding() self.__datafetched = True code_page = 'ansicpg' + self.__code_page + if self.__code_page == '10000': + self.__code_page = 'mac_roman' return self.__platform, code_page, self.__default_num def get_codepage(self): if not self.__datafetched: self._encoding() self.__datafetched = True + if self.__code_page == '10000': + self.__code_page = 'mac_roman' return self.__code_page def get_platform(self): diff --git a/src/calibre/ebooks/rtf2xml/fonts.py b/src/calibre/ebooks/rtf2xml/fonts.py index b85717ce48..45ed3c1957 100755 --- a/src/calibre/ebooks/rtf2xml/fonts.py +++ b/src/calibre/ebooks/rtf2xml/fonts.py @@ -16,7 +16,9 @@ # # ######################################################################### import sys, os, tempfile + from calibre.ebooks.rtf2xml import copy + class Fonts: """ Change lines with font info from font numbers to the actual font names. @@ -45,6 +47,7 @@ class Fonts: self.__default_font_num = default_font_num self.__write_to = tempfile.mktemp() self.__run_level = run_level + def __initiate_values(self): """ Initiate all values. @@ -67,6 +70,7 @@ class Fonts: self.__font_table = {} # individual font written self.__wrote_ind_font = 0 + def __default_func(self, line): """ Requires: @@ -79,6 +83,7 @@ class Fonts: if self.__token_info == 'miTimes0\n' ) + 'Times0\n') + def __after_font_table_func(self, line): """ Required: @@ -169,7 +177,7 @@ class Fonts: if self.__token_info == 'cw 3: msg = 'no value for %s in self.__font_table\n' % font_num raise self.__bug_handler, msg @@ -182,6 +190,7 @@ class Fonts: ) else: self.__write_obj.write(line) + def convert_fonts(self): """ Required: @@ -197,20 +206,15 @@ class Fonts: info. Substitute a font name for a font number. """ self.__initiate_values() - read_obj = open(self.__file, 'r') - self.__write_obj = open(self.__write_to, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - action = self.__state_dict.get(self.__state) - if action == None: - sys.stderr.write('no no matching state in module fonts.py\n') - sys.stderr.write(self.__state + '\n') - action(line) - read_obj.close() - self.__write_obj.close() + with open(self.__file, 'r') as read_obj: + with open(self.__write_to, 'w') as self.__write_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__state_dict.get(self.__state) + if action is None: + sys.stderr.write('no matching state in module fonts.py\n' \ + + self.__state + '\n') + action(line) default_font_name = self.__font_table.get(self.__default_font_num) if not default_font_name: default_font_name = 'Not Defined' diff --git a/src/calibre/ebooks/rtf2xml/get_char_map.py b/src/calibre/ebooks/rtf2xml/get_char_map.py index cb118b0df8..bd487bb6f5 100755 --- a/src/calibre/ebooks/rtf2xml/get_char_map.py +++ b/src/calibre/ebooks/rtf2xml/get_char_map.py @@ -41,7 +41,7 @@ class GetCharMap: def get_char_map(self, map): if map == 'ansicpg0': map = 'ansicpg1250' - if map in ('ansicpg10000', '10000'): + if map == 'ansicpg10000': map = 'mac_roman' found_map = False map_dict = {} diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py index 9ebd718833..84acd26a57 100755 --- a/src/calibre/ebooks/rtf2xml/tokenize.py +++ b/src/calibre/ebooks/rtf2xml/tokenize.py @@ -115,6 +115,7 @@ class Tokenize: def __sub_reg_split(self,input_file): input_file = self.__replace_spchar.mreplace(input_file) + # this is for older RTF input_file = self.__par_exp.sub('\n\\par \n', input_file) input_file = self.__cs_ast.sub("\g<1>", input_file) input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file) @@ -126,12 +127,6 @@ class Tokenize: tokens = re.split(self.__splitexp, input_file) #remove empty tokens and \n return filter(lambda x: len(x) > 0 and x != '\n', tokens) - #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file) - # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line) - # this is for older RTF - #line = re.sub(self.__par_exp, '\\par ', line) - #return filter(lambda x: len(x) > 0, \ - #(self.__remove_line.sub('', x) for x in tokens)) def __compile_expressions(self): SIMPLE_RPL = { @@ -160,7 +155,7 @@ class Tokenize: } self.__replace_spchar = MReplace(SIMPLE_RPL) #add ;? in case of char following \u - self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)" + self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?") self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+") #manage upr/ud situations @@ -174,14 +169,21 @@ class Tokenize: self.__par_exp = re.compile(r'\\\n+') #handle improper cs char-style with \* before without { self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)') - # self.__par_exp = re.compile(r'\\$') + #handle cw using a digit as argument and without space as delimiter + self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)") #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}") #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})") #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)") #self.__remove_line = re.compile(r'\n+') - #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)") ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)") + def __correct_spliting(self, token): + match_obj = re.search(self.__cwdigit_exp, token) + if match_obj is None: + return token + else: + return '%s\n%s' % (match_obj.group(1), match_obj.group(2)) + def tokenize(self): """Main class for handling other methods. Reads the file \ , uses method self.sub_reg to make basic substitutions,\ @@ -197,6 +199,8 @@ class Tokenize: tokens = map(self.__unicode_process, tokens) #remove empty items created by removing \uc tokens = filter(lambda x: len(x) > 0, tokens) + #handles bothersome cases + tokens = map(self.__correct_spliting, tokens) #write with open(self.__write_to, 'wb') as write_obj: @@ -205,8 +209,6 @@ class Tokenize: copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "tokenize.data") - # if self.__out_file: - # self.__file = self.__out_file copy_obj.rename(self.__write_to, self.__file) os.remove(self.__write_to)