Global overhaul of rtf2xml: RTF fixes (3) -> removal of preprocessing, first draft of tokenize finished, introduction of \upr/\ud handling for unicode
This commit is contained in:
parent ae8fcb1fd4
commit 7c70914ad3
@@ -26,7 +26,7 @@ class Tokenize:
             in_file,
             bug_handler,
             copy = None,
-            #run_level = 1,
+            run_level = 1,
             ):
         self.__file = in_file
         self.__bug_handler = bug_handler
@@ -37,17 +37,22 @@ class Tokenize:
         self.__uc_char = 0
         self.__uc_bin = False
         self.__uc_value = [1]

-    def __from_ms_to_utf8(self,match_obj):
-        uni_char = int(match_obj.group(1))
-        if uni_char < 0:
-            uni_char += 65536
-        return '&#x' + str('%X' % uni_char) + ';'
-
     def __reini_utf8_counters(self):
         self.__uc_char = 0
         self.__uc_bin = False

+    def __remove_uc_chars(self, startchar, token):
+        for i in xrange(startchar, len(token)):
+            if token[i] == " ":
+                continue
+            elif self.__uc_char:
+                self.__uc_char -= 1
+            else:
+                return token[i:]
+        #if only " " and char to skip
+        return ''
+
     def __unicode_process(self, token):
         #change scope in
         if token == '\{':
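
The new __remove_uc_chars helper implements the RTF \ucN convention: every \uN control word is followed by N fallback characters meant for non-Unicode readers, and a Unicode-aware tokenizer must silently discard them. A minimal standalone sketch of the same skipping logic, written in Python 3 for illustration (the diff itself is Python 2, hence xrange); the function name and the returned (tail, budget) pair are this sketch's own, not the module's API:

def remove_uc_chars(uc_char, startchar, token):
    # Drop up to uc_char fallback characters from token[startchar:].
    # Spaces are not counted against the skip budget, mirroring the helper.
    for i in range(startchar, len(token)):
        if token[i] == " ":
            continue                   # delimiter space, not a fallback char
        elif uc_char:
            uc_char -= 1               # consume one fallback character
        else:
            return token[i:], uc_char  # surviving tail, nothing left to skip
    # the token held only spaces and characters to skip
    return '', uc_char

print(remove_uc_chars(1, 0, "e suite"))   # -> ('suite', 0)
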
@@ -55,9 +60,9 @@ class Tokenize:
             #basic error handling
             self.__reini_utf8_counters()
             return token
-        #change scope out: evaluate dict and rebuild
+        #change scope out
         elif token == '\}':
-            #self.__uc_value.pop()
+            self.__uc_value.pop()
             self.__reini_utf8_counters()
             return token
         #add a uc control
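
Re-enabling self.__uc_value.pop() makes \ucN properly group-scoped: a value set inside an RTF group must stop applying at the closing brace. Presumably the matching '\{' branch appends a copy of the current value; a minimal sketch of that stack discipline (the function names here are illustrative only, not the module's API):

uc_value = [1]                     # RTF default is \uc1

def enter_group():                 # on '\{'
    uc_value.append(uc_value[-1])  # the inner group inherits the value

def leave_group():                 # on '\}'
    uc_value.pop()                 # restore the enclosing scope

enter_group()
uc_value[-1] = 0                   # a \uc0 seen inside the group
leave_group()
assert uc_value[-1] == 1           # the outer value is intact again
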
@@ -65,58 +70,65 @@ class Tokenize:
             self.__uc_value[-1] = int(token[3:])
             self.__reini_utf8_counters()
             return token
-        #handle uc skippable char
+        #bin data to slip
+        elif self.__uc_bin:
+            self.__uc_bin = False
+            return ''
+        #uc char to remove
         elif self.__uc_char:
-            #if token[:1] == "\" and token[:1] == "\"
-            pass
+            #handle \bin tag in case of uc char to skip
+            if token[:4] == '\bin':
+                self.__uc_char -=1
+                self.__uc_bin = True
+                return ''
+            elif token[:1] == "\\" :
+                self.__uc_char -=1
+                return ''
+            else:
+                return self.__remove_uc_chars(0, token)
         #go for real \u token
         match_obj = self.__utf_exp.match(token)
         if match_obj is not None:
+            self.__reini_utf8_counters()
             #get value and handle negative case
             uni_char = int(match_obj.group(1))
             uni_len = len(match_obj.group(1)) + 2
             if uni_char < 0:
                 uni_char += 65536
             uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
-            #if not uc0
-            if self.__uc_value[-1]:
-                self.__uc_char = self.__uc_value[-1]
+            self.__uc_char = self.__uc_value[-1]
             #there is only an unicode char
             if len(token)<= uni_len:
                 return uni_char
             #an unicode char and something else
             #must be after as it is splited on \
-            elif not self.__uc_value[-1]:
-                print('not only token uc0 token: ' + uni_char + token[uni_len:])
+            #necessary? maybe for \bin?
+            elif not self.__uc_char:
                 return uni_char + token[uni_len:]
             #if not uc0 and chars
             else:
-                for i in xrange(uni_len, len(token)):
-                    if token[i] == " ":
-                        continue
-                    elif self.__uc_char > 0:
-                        self.__uc_char -= 1
-                    else:
-                        return uni_char + token[i:]
-                #print('uc: ' + str(self.__uc_value) + 'uni: ' + str(uni_char) + 'token: ' + token)
+                return uni_char + self.__remove_uc_chars(uni_len, token)
         #default
         return token

     def __sub_reg_split(self,input_file):
         input_file = self.__replace_spchar.mreplace(input_file)
-        #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
-        # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
-        # this is for older RTF
-        #line = re.sub(self.__par_exp, '\\par ', line)
-        input_file = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", input_file)
+        input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
+        input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
+        #remove \n in bin data
+        input_file = self.__bin_exp.sub(lambda x: \
+            x.group().replace('\n', '') +'\n', input_file)
         #split
         tokens = re.split(self.__splitexp, input_file)
         #remove empty tokens and \n
         return filter(lambda x: len(x) > 0 and x != '\n', tokens)
+        #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
+        # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
+        # this is for older RTF
+        #line = re.sub(self.__par_exp, '\\par ', line)
         #return filter(lambda x: len(x) > 0, \
         #(self.__remove_line.sub('', x) for x in tokens))
-

     def __compile_expressions(self):
         SIMPLE_RPL = {
             "\\\\": "\\backslash ",
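
Three details of this hunk deserve a note. The argument of \uN is a signed 16-bit integer, so code points above 0x7FFF arrive negative and are recovered by adding 65536. The surviving character is then serialized through 'xmlcharrefreplace', which emits decimal XML character references (the removed __from_ms_to_utf8 emitted hexadecimal ones). Finally, the new __uc_bin flag covers fallbacks given as \binN blobs, which must be swallowed whole rather than counted character by character. A sketch of the numeric path in Python 3, with chr standing in for Python 2's unichr and a hypothetical function name:

def u_token_to_ref(value):
    # Turn the signed 16-bit argument of an RTF \uN control word
    # into an XML character reference, as the live code path does.
    if value < 0:
        value += 65536      # undo the signed 16-bit wrap-around
    return chr(value).encode('ascii', 'xmlcharrefreplace')

print(u_token_to_ref(233))      # b'&#233;'   (e-acute)
print(u_token_to_ref(-3913))    # b'&#61623;' (0xF0B7, private-use area)
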
@@ -145,18 +157,25 @@ class Tokenize:
             r'\\$': '\\par ',
             }
         self.__replace_spchar = MReplace(SIMPLE_RPL)
+        #add ;? in case of char following \u
         self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
-        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}") #modify this
-        #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
+        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
+        self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
+        #manage upr/ud situations
+        self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" + \
+            r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
         #add \n in split for whole file reading
-        #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
         #why keep backslash whereas \is replaced before?
+        #remove \n from endline char
         self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
+        #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
+        #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
+        #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
         #self.__par_exp = re.compile(r'\\$')
         #self.__remove_line = re.compile(r'\n+')
         #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
         ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")

     def tokenize(self):
         """Main class for handling other methods. Reads the file \
         , uses method self.sub_reg to make basic substitutions,\
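
Among the new expressions, __utf_ud targets \upr groups, which carry the same text twice: an ANSI-only rendition and, nested under {\*\ud {...}}, a Unicode one. The substitution applied in __sub_reg_split keeps only the Unicode branch and wraps it in \uc0 so that no fallback characters are expected afterwards. __bin_exp in turn matches \binN payloads so their embedded newlines can be stripped before the line-sensitive split. A toy check of the \bin rewrite and of the tightened __utf_exp (patterns copied from the diff, input synthetic):

import re

bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")

sample = "a \\bin4 01\n01 b"
# newlines inside the matched blob are removed, one is re-appended
print(repr(bin_exp.sub(lambda m: m.group().replace('\n', '') + '\n', sample)))
# -> 'a \\bin4 0101\n b'

m = utf_exp.match("\\u233 e")
print(m.group(1), repr(m.group()))   # -> 233 '\\u233 ' (delimiter space eaten)
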
||||||
@ -170,9 +189,9 @@ class Tokenize:
|
|||||||
#remove '' and \n in the process
|
#remove '' and \n in the process
|
||||||
tokens = self.__sub_reg_split(input_file)
|
tokens = self.__sub_reg_split(input_file)
|
||||||
#correct unicode
|
#correct unicode
|
||||||
#tokens = map(self.__unicode_process, tokens)
|
tokens = map(self.__unicode_process, tokens)
|
||||||
#remove empty items created by removing \uc
|
#remove empty items created by removing \uc
|
||||||
#tokens = filter(lambda x: len(x) > 0, tokens)
|
tokens = filter(lambda x: len(x) > 0, tokens)
|
||||||
|
|
||||||
#write
|
#write
|
||||||
write_obj = open(self.__write_to, 'wb')
|
write_obj = open(self.__write_to, 'wb')
|
||||||
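
The commit also switches the unicode stage on: tokenize() now maps every token through __unicode_process and then filters out the empty strings left behind by skipped fallback characters. A toy end-to-end illustration with a stand-in processor (the real method additionally tracks \uc scoping and \bin data):

import re

skip = 0                            # fallback chars still to discard
def process(token):                 # stand-in for __unicode_process
    global skip
    m = re.match(r"\\u(-?\d{3,6}) ?", token)
    if m:                           # \uN: emit a char ref, arm the skip
        skip = 1                    # as under \uc1
        return '&#%d;' % int(m.group(1))
    if skip:                        # swallow one fallback token
        skip -= 1
        return ''
    return token

tokens = ['\\u233 ', 'e', 'text']
tokens = filter(lambda x: len(x) > 0, map(process, tokens))
print(list(tokens))                 # -> ['&#233;', 'text']
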
@@ -241,4 +260,9 @@ class Tokenize:
         neg_uni_char = int(match_obj.group(1)) * -1
         # sys.stderr.write(str( neg_uni_char))
         uni_char = neg_uni_char + 65536
+        return '&#x' + str('%X' % uni_char) + ';'''
+    '''def __from_ms_to_utf8(self,match_obj):
+        uni_char = int(match_obj.group(1))
+        if uni_char < 0:
+            uni_char += 65536
         return '&#x' + str('%X' % uni_char) + ';'''