From ed4da14df07a4c61a21bfe09c542aa4802863a9d Mon Sep 17 00:00:00 2001
From: Sengian <sengian1@gmail.com>
Date: Mon, 31 Jan 2011 08:29:42 +0100
Subject: [PATCH] Correct problems with tag splitting in RTFParser, some
 encoding refactoring & move all encodings to UTF-8 or US-ASCII for lxml

---
 src/calibre/ebooks/rtf/input.py               | 23 ++------
 src/calibre/ebooks/rtf2xml/ParseRtf.py        |  2 +
 src/calibre/ebooks/rtf2xml/colors.py          | 54 +++++++++++--------
 src/calibre/ebooks/rtf2xml/convert_to_tags.py | 38 ++++++++-----
 .../ebooks/rtf2xml/default_encoding.py        |  4 ++
 src/calibre/ebooks/rtf2xml/fonts.py           | 36 +++++++------
 src/calibre/ebooks/rtf2xml/get_char_map.py    |  2 +-
 src/calibre/ebooks/rtf2xml/tokenize.py        | 24 +++++----
 8 files changed, 101 insertions(+), 82 deletions(-)

diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 6361cb7fdb..caa35a9eda 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -85,6 +85,7 @@ class RTFInput(InputFormatPlugin):
                 debug_dir = 'rtfdebug'
                 run_lev = 4
                 indent_out = 1
+                self.log('Running RTFParser in debug mode')
             except:
                 pass
         parser = ParseRtf(
@@ -233,22 +234,6 @@ class RTFInput(InputFormatPlugin):
         with open('styles.css', 'ab') as f:
             f.write(css)
 
-    # def preprocess(self, fname):
-        # self.log('\tPreprocessing to convert unicode characters')
-        # try:
-            # data = open(fname, 'rb').read()
-            # from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
-            # tokenizer = RtfTokenizer(data)
-            # tokens = RtfTokenParser(tokenizer.tokens)
-            # data = tokens.toRTF()
-            # fname = 'preprocessed.rtf'
-            # with open(fname, 'wb') as f:
-                # f.write(data)
-        # except:
-            # self.log.exception(
-            # 'Failed to preprocess RTF to convert unicode sequences, ignoring...')
-        # return fname
-
     def convert_borders(self, doc):
         border_styles = []
         style_map = {}
@@ -283,8 +268,6 @@ class RTFInput(InputFormatPlugin):
         self.opts = options
         self.log = log
         self.log('Converting RTF to XML...')
-        #Name of the preprocesssed RTF file
-        # fname = self.preprocess(stream.name)
         try:
             xml = self.generate_xml(stream.name)
         except RtfInvalidCodeException, e:
@@ -338,4 +321,6 @@ class RTFInput(InputFormatPlugin):
         opf.render(open('metadata.opf', 'wb'))
         return os.path.abspath('metadata.opf')
 
-#ebook-convert "bad.rtf" test.epub -v -d "D:\Mes eBooks\Developpement\debug"
\ No newline at end of file
+#ebook-convert "bad.rtf" test.epub -v -d "D:\Mes eBooks\Developpement\debug"
+# os.makedirs('D:\\Mes eBooks\\Developpement\\rtfdebug')
+# debug_dir = 'D:\\Mes eBooks\\Developpement\\rtfdebug'
\ No newline at end of file
diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py
index a28b6f81da..56e18fe74d 100755
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@@ -238,6 +238,8 @@ class ParseRtf:
                     bug_handler = RtfInvalidCodeException,
                         )
             enc = 'cp' + encode_obj.get_codepage()
+            if enc in ('cp10000', 'cpmac_roman'):  # get_codepage() may already return 'mac_roman' for code page 10000
+                enc = 'mac_roman'
             msg = 'Exception in token processing'
             if check_encoding_obj.check_encoding(self.__file, enc):
                 file_name = self.__file if isinstance(self.__file, str) \
diff --git a/src/calibre/ebooks/rtf2xml/colors.py b/src/calibre/ebooks/rtf2xml/colors.py
index d81b293bbf..eba03547c8 100755
--- a/src/calibre/ebooks/rtf2xml/colors.py
+++ b/src/calibre/ebooks/rtf2xml/colors.py
@@ -15,8 +15,10 @@
 #                                                                       #
 #                                                                       #
 #########################################################################
-import sys, os, tempfile,  re
+import sys, os, tempfile, re
+
 from calibre.ebooks.rtf2xml import copy
+
 class Colors:
     """
     Change lines with color info from color numbers to the actual color names.
@@ -40,8 +42,10 @@ class Colors:
         self.__file = in_file
         self.__copy = copy
         self.__bug_handler = bug_handler
+        self.__line = 0
         self.__write_to = tempfile.mktemp()
         self.__run_level = run_level
+
     def __initiate_values(self):
         """
         Initiate all values.
@@ -61,6 +65,7 @@ class Colors:
         self.__color_num = 1
         self.__line_color_exp = re.compile(r'bdr-color_:(\d+)')
         # cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2
+
     def __before_color_func(self, line):
         """
         Requires:
@@ -76,6 +81,7 @@ class Colors:
         if self.__token_info == 'mi<mk<clrtbl-beg':
             self.__state = 'in_color_table'
         self.__write_obj.write(line)
+
     def __default_color_func(self, line):
         """
         Requires:
@@ -87,6 +93,7 @@ class Colors:
             """
         hex_num = line[-3:-1]
         self.__color_string += hex_num
+
     def __blue_func(self, line):
         """
         Requires:
@@ -109,6 +116,7 @@ class Colors:
         )
         self.__color_num += 1
         self.__color_string = '#'
+
     def __in_color_func(self, line):
         """
         Requires:
@@ -127,12 +135,13 @@ class Colors:
             self.__state = 'after_color_table'
         else:
             action = self.__state_dict.get(self.__token_info)
-            if action == None:
+            if action is None:
                 sys.stderr.write('in module colors.py\n'
                 'function is self.__in_color_func\n'
                 'no action for %s' % self.__token_info
                 )
             action(line)
+
     def __after_color_func(self, line):
         """
         Check the to see if it contains color info. If it does, extract the
@@ -180,6 +189,7 @@ class Colors:
         else:
             self.__write_obj.write(line)
         # cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2
+
     def __sub_from_line_color(self, match_obj):
         num = match_obj.group(1)
         try:
@@ -191,25 +201,27 @@ class Colors:
             else:
                 return 'bdr-color_:no-value'
         hex_num = self.__figure_num(num)
-        return_value = 'bdr-color_:%s' % hex_num
-        return return_value
+        return 'bdr-color_:%s' % hex_num
+
     def __figure_num(self, num):
         if num == 0:
             hex_num = 'false'
         else:
             hex_num = self.__color_dict.get(num)
-        if hex_num == None:
-            if self.__run_level > 3:
-                msg = 'no value in self.__color_dict for key %s\n' % num
-                raise self.__bug_hanlder, msg
-        if hex_num == None:
+        if hex_num is None:
             hex_num = '0'
+            if self.__run_level > 5:  # an unknown color number is only fatal above run level 5
+                msg = 'no value in self.__color_dict ' \
+                'for key %s at line %d\n' % (num, self.__line)
+                raise self.__bug_handler, msg
         return hex_num
+
     def __do_nothing_func(self, line):
         """
         Bad RTF will have text in the color table
         """
         pass
+
     def convert_colors(self):
         """
         Requires:
@@ -226,20 +238,16 @@ class Colors:
             info, and substitute the number with the hex number.
         """
         self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('no no matching state in module fonts.py\n')
-                sys.stderr.write(self.__state + '\n')
-            action(line)
-        read_obj.close()
-        self.__write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
+                    self.__line += 1  # line counter used in the __figure_num error message
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write('no matching state in module colors.py\n')
+                        sys.stderr.write(self.__state + '\n')
+                    action(line)
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "color.data")
diff --git a/src/calibre/ebooks/rtf2xml/convert_to_tags.py b/src/calibre/ebooks/rtf2xml/convert_to_tags.py
index 6927537474..1abc672f85 100755
--- a/src/calibre/ebooks/rtf2xml/convert_to_tags.py
+++ b/src/calibre/ebooks/rtf2xml/convert_to_tags.py
@@ -33,13 +33,13 @@ class ConvertToTags:
         self.__copy = copy
         self.__dtd_path = dtd_path
         self.__no_dtd = no_dtd
-        if encoding != 'mac_roman':
-            self.__encoding = 'cp' + encoding
-        else:
+        self.__encoding = 'cp' + encoding
+        if encoding == 'mac_roman':
             self.__encoding = 'mac_roman'
         self.__indent = indent
         self.__run_level = run_level
         self.__write_to = tempfile.mktemp()
+        self.__convert_utf = False
 
     def __initiate_values(self):
         """
@@ -213,7 +213,8 @@ class ConvertToTags:
         if not check_encoding_obj.check_encoding(self.__file, verbose=False):
             self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
         elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
-            self.__write_obj.write('<?xml version="1.0" encoding="%s" ?>' % self.__encoding)
+            self.__write_obj.write('<?xml version="1.0" encoding="UTF-8" ?>')
+            self.__convert_utf = True
         else:
             self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
             sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
@@ -253,15 +254,28 @@ class ConvertToTags:
             an empty tag function.
             """
         self.__initiate_values()
-        self.__write_obj = open(self.__write_to, 'w')
-        self.__write_dec()
-        with open(self.__file, 'r') as read_obj:
-            for line in read_obj:
-                self.__token_info = line[:16]
-                action = self.__state_dict.get(self.__token_info)
-                if action is not None:
-                    action(line)
+        with open(self.__write_to, 'w') as self.__write_obj:
+            self.__write_dec()
+            with open(self.__file, 'r') as read_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__token_info)
+                    if action is not None:
+                        action(line)
         self.__write_obj.close()
+        #convert all encodings to UTF8 to avoid unsupported encodings in lxml
+        if self.__convert_utf:
+            copy_obj = copy.Copy(bug_handler = self.__bug_handler)
+            copy_obj.rename(self.__write_to, self.__file)
+            with open(self.__file, 'r') as read_obj:
+                data = read_obj.read()
+            try:
+                data = data.decode(self.__encoding).encode('utf-8')
+            except (UnicodeError, LookupError): #keep the raw bytes instead of writing an empty file
+                sys.stderr.write('Conversion to UTF-8 is not possible,'
+                ' encoding should be very carefully checked')
+            with open(self.__write_to, 'w') as write_obj:
+                write_obj.write(data)
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "convert_to_tags.data")
diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py
index 3ddfbcd321..c0a43db800 100755
--- a/src/calibre/ebooks/rtf2xml/default_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/default_encoding.py
@@ -75,12 +75,16 @@ class DefaultEncoding:
             self._encoding()
             self.__datafetched = True
             code_page = 'ansicpg' + self.__code_page
+            if self.__code_page == '10000':
+                self.__code_page = 'mac_roman'
         return self.__platform, code_page, self.__default_num
 
     def get_codepage(self):
         if not self.__datafetched:
             self._encoding()
             self.__datafetched = True
+        if self.__code_page == '10000':  # checked on every call, not only right after the first fetch
+            self.__code_page = 'mac_roman'
         return self.__code_page
 
     def get_platform(self):
diff --git a/src/calibre/ebooks/rtf2xml/fonts.py b/src/calibre/ebooks/rtf2xml/fonts.py
index b85717ce48..45ed3c1957 100755
--- a/src/calibre/ebooks/rtf2xml/fonts.py
+++ b/src/calibre/ebooks/rtf2xml/fonts.py
@@ -16,7 +16,9 @@
 #                                                                       #
 #########################################################################
 import sys, os, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+
 class Fonts:
     """
     Change lines with font info from font numbers to the actual font names.
@@ -45,6 +47,7 @@ class Fonts:
         self.__default_font_num = default_font_num
         self.__write_to = tempfile.mktemp()
         self.__run_level = run_level
+
     def __initiate_values(self):
         """
         Initiate all values.
@@ -67,6 +70,7 @@ class Fonts:
         self.__font_table = {}
         # individual font written
         self.__wrote_ind_font = 0
+
     def __default_func(self, line):
         """
         Requires:
@@ -79,6 +83,7 @@ class Fonts:
         if self.__token_info == 'mi<mk<fonttb-beg':
             self.__state = 'font_table'
         self.__write_obj.write(line)
+
     def __font_table_func(self, line):
         """
         Requires:
@@ -101,6 +106,7 @@ class Fonts:
             self.__font_num = self.__default_font_num
             self.__text_line = ''
         ##self.__write_obj.write(line)
+
     def __font_in_table_func(self, line):
         """
         Requires:
@@ -138,6 +144,7 @@ class Fonts:
         elif self.__token_info == 'mi<mk<fonttb-end':
             self.__found_end_font_table_func()
             self.__state = 'after_font_table'
+
     def __found_end_font_table_func(self):
         """
         Required:
@@ -150,7 +157,8 @@ class Fonts:
         if not self.__wrote_ind_font:
             self.__write_obj.write(
             'mi<tg<empty-att_'
-            '<font-in-table<name>Times<num>0\n' )
+            '<font-in-table<name>Times<num>0\n')
+
     def __after_font_table_func(self, line):
         """
         Required:
@@ -169,7 +177,7 @@ class Fonts:
         if self.__token_info == 'cw<ci<font-style':
             font_num = line[20:-1]
             font_name = self.__font_table.get(font_num)
-            if font_name == None:
+            if font_name is None:
                 if self.__run_level > 3:
                     msg = 'no value for %s in self.__font_table\n' % font_num
                     raise self.__bug_handler, msg
@@ -182,6 +190,7 @@ class Fonts:
                 )
         else:
             self.__write_obj.write(line)
+
     def convert_fonts(self):
         """
         Required:
@@ -197,20 +206,15 @@ class Fonts:
             info. Substitute a font name for a font number.
             """
         self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('no no matching state in module fonts.py\n')
-                sys.stderr.write(self.__state + '\n')
-            action(line)
-        read_obj.close()
-        self.__write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write('no matching state in module fonts.py\n' \
+                                            + self.__state + '\n')
+                    action(line)
         default_font_name = self.__font_table.get(self.__default_font_num)
         if not default_font_name:
             default_font_name = 'Not Defined'
diff --git a/src/calibre/ebooks/rtf2xml/get_char_map.py b/src/calibre/ebooks/rtf2xml/get_char_map.py
index cb118b0df8..bd487bb6f5 100755
--- a/src/calibre/ebooks/rtf2xml/get_char_map.py
+++ b/src/calibre/ebooks/rtf2xml/get_char_map.py
@@ -41,7 +41,7 @@ class GetCharMap:
     def get_char_map(self, map):
         if map == 'ansicpg0':
             map = 'ansicpg1250'
-        if map in ('ansicpg10000', '10000'):
+        if map == 'ansicpg10000':
             map = 'mac_roman'
         found_map = False
         map_dict = {}
diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py
index 9ebd718833..84acd26a57 100755
--- a/src/calibre/ebooks/rtf2xml/tokenize.py
+++ b/src/calibre/ebooks/rtf2xml/tokenize.py
@@ -115,6 +115,7 @@ class Tokenize:
 
     def __sub_reg_split(self,input_file):
         input_file = self.__replace_spchar.mreplace(input_file)
+        # this is for older RTF
         input_file = self.__par_exp.sub('\n\\par \n', input_file)
         input_file = self.__cs_ast.sub("\g<1>", input_file)
         input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
@@ -126,12 +127,6 @@ class Tokenize:
         tokens = re.split(self.__splitexp, input_file)
         #remove empty tokens and \n
         return filter(lambda x: len(x) > 0 and x != '\n', tokens)
-        #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
-        # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
-        # this is for older RTF
-        #line = re.sub(self.__par_exp, '\\par ', line)
-        #return filter(lambda x: len(x) > 0, \
-            #(self.__remove_line.sub('', x) for x in tokens)) 
 
     def __compile_expressions(self):
         SIMPLE_RPL = {
@@ -160,7 +155,7 @@ class Tokenize:
             }
         self.__replace_spchar = MReplace(SIMPLE_RPL)
         #add ;? in case of char following \u
-        self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
+        self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
         self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
         self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
         #manage upr/ud situations
@@ -174,14 +169,21 @@ class Tokenize:
         self.__par_exp = re.compile(r'\\\n+')
         #handle improper cs char-style with \* before without {
         self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)')
-        # self.__par_exp = re.compile(r'\\$')
+        #handle cw using a digit as argument and without space as delimiter
+        self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)")
         #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
         #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
         #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
         #self.__remove_line = re.compile(r'\n+')
-        #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
         ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
 
+    def __correct_spliting(self, token):
+        #put a newline between a control word ending in digits and text glued to it, e.g. \f1abc
+        found = self.__cwdigit_exp.search(token)
+        if found is not None:
+            return '%s\n%s' % (found.group(1), found.group(2))
+        return token
+
     def tokenize(self):
         """Main class for handling other methods. Reads the file \
         , uses method self.sub_reg to make basic substitutions,\
@@ -197,6 +199,8 @@ class Tokenize:
         tokens = map(self.__unicode_process, tokens)
         #remove empty items created by removing \uc
         tokens = filter(lambda x: len(x) > 0, tokens)
+        #handles bothersome cases
+        tokens = map(self.__correct_spliting, tokens)
         
         #write
         with open(self.__write_to, 'wb') as write_obj:
@@ -205,8 +209,6 @@ class Tokenize:
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "tokenize.data")
-        # if self.__out_file:
-            # self.__file = self.__out_file
         copy_obj.rename(self.__write_to, self.__file)
         os.remove(self.__write_to)