Global overhaul of rtf2xml: RTFfixes (3) -> removal of preprocessing, first draft of tokenize finished, introduction of \upr/\ud handling for unicode

Sengian 2010-08-12 16:25:09 +02:00
parent ae8fcb1fd4
commit 7c70914ad3


@ -26,7 +26,7 @@ class Tokenize:
in_file,
bug_handler,
copy = None,
#run_level = 1,
run_level = 1,
):
self.__file = in_file
self.__bug_handler = bug_handler
@ -37,17 +37,22 @@ class Tokenize:
self.__uc_char = 0
self.__uc_bin = False
self.__uc_value = [1]
def __from_ms_to_utf8(self,match_obj):
uni_char = int(match_obj.group(1))
if uni_char < 0:
uni_char += 65536
return '&#x' + str('%X' % uni_char) + ';'
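Note: RTF writes \uN as a signed 16-bit decimal, so code points above 0x7FFF come out negative and need +65536; that is the correction __from_ms_to_utf8 applies. A minimal standalone sketch of the same conversion (function and variable names here are illustrative, not from this file):

import re

UTF_EXP = re.compile(r"\\u(-?\d{3,6}) ?")   # \uN with an optional trailing space

def to_charref(match):
    code = int(match.group(1))
    if code < 0:              # signed 16-bit value written by MS tools
        code += 65536         # e.g. -3913 + 65536 = 61623 = 0xF0B7
    return '&#x%X;' % code

print(UTF_EXP.sub(to_charref, r"\u-3913?"))   # prints: &#xF0B7;?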
def __reini_utf8_counters(self):
self.__uc_char = 0
self.__uc_bin = False
def __remove_uc_chars(self, startchar, token):
for i in xrange(startchar, len(token)):
if token[i] == " ":
continue
elif self.__uc_char:
self.__uc_char -= 1
else:
return token[i:]
#only " " and chars to skip were left
return ''
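__remove_uc_chars drops the \ucN fallback characters an RTF writer places after a \uN escape, without counting delimiter spaces. A rough standalone equivalent, written as a plain function for illustration (names are not from this file):

def remove_uc_chars(token, to_skip, start=0):
    # Skip `to_skip` non-space characters from token[start:], keep the rest.
    for i in range(start, len(token)):
        if token[i] == ' ':        # spaces do not count as fallback chars
            continue
        if to_skip:
            to_skip -= 1
        else:
            return token[i:]
    return ''                      # nothing left but spaces and skipped chars

print(remove_uc_chars('? rest', 1))   # prints: rest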
def __unicode_process(self, token):
#change scope in
if token == '\{':
@ -55,9 +60,9 @@ class Tokenize:
#basic error handling
self.__reini_utf8_counters()
return token
#change scope out: evaluate dict and rebuild
#change scope out
elif token == '\}':
#self.__uc_value.pop()
self.__uc_value.pop()
self.__reini_utf8_counters()
return token
#add a uc control
@ -65,58 +70,65 @@ class Tokenize:
self.__uc_value[-1] = int(token[3:])
self.__reini_utf8_counters()
return token
#handle uc skippable char
#bin data to skip
elif self.__uc_bin:
self.__uc_bin = False
return ''
#uc char to remove
elif self.__uc_char:
#if token[:1] == "\" and token[:1] == "\"
pass
#handle \bin tag in case of uc char to skip
if token[:4] == '\bin':
self.__uc_char -=1
self.__uc_bin = True
return ''
elif token[:1] == "\\" :
self.__uc_char -=1
return ''
else:
return self.__remove_uc_chars(0, token)
#go for real \u token
match_obj = self.__utf_exp.match(token)
if match_obj is not None:
self.__reini_utf8_counters()
#get value and handle negative case
uni_char = int(match_obj.group(1))
uni_len = len(match_obj.group(1)) + 2
if uni_char < 0:
uni_char += 65536
uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
#if not uc0
if self.__uc_value[-1]:
self.__uc_char = self.__uc_value[-1]
self.__uc_char = self.__uc_value[-1]
#there is only a unicode char
if len(token)<= uni_len:
return uni_char
#a unicode char and something else
#must come after, as the token was split on \
elif not self.__uc_value[-1]:
print('not only token uc0 token: ' + uni_char + token[uni_len:])
#necessary? maybe for \bin?
elif not self.__uc_char:
return uni_char + token[uni_len:]
#if not uc0 and chars
else:
for i in xrange(uni_len, len(token)):
if token[i] == " ":
continue
elif self.__uc_char > 0:
self.__uc_char -= 1
else:
return uni_char + token[i:]
#print('uc: ' + str(self.__uc_value) + 'uni: ' + str(uni_char) + 'token: ' + token)
return uni_char + self.__remove_uc_chars(uni_len, token)
#default
return token
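Per the RTF spec the \ucN skip count is scoped to its group, which is why __uc_value is kept as a stack and popped on '\}'. A hedged standalone sketch of that scoping rule; the plain-brace tokens and names below are illustrative, not the class's own:

uc_stack = [1]                 # RTF default: one fallback char per \uN

def on_token(token):
    if token == '{':
        uc_stack.append(uc_stack[-1])   # a new group inherits the current \uc
    elif token == '}':
        if len(uc_stack) > 1:
            uc_stack.pop()              # restore the enclosing group's value
    elif token.startswith('\\uc'):
        uc_stack[-1] = int(token[3:])   # \ucN overrides it inside this group

for tok in ['{', '\\uc0', '{', '\\uc2', '}', '}']:
    on_token(tok)
    print('%-5s %s' % (tok, uc_stack))  # stack after each token, ends at [1]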
def __sub_reg_split(self,input_file):
input_file = self.__replace_spchar.mreplace(input_file)
#input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
# this is for older RTF
#line = re.sub(self.__par_exp, '\\par ', line)
input_file = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", input_file)
input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
#remove \n in bin data
input_file = self.__bin_exp.sub(lambda x: \
x.group().replace('\n', '') +'\n', input_file)
#split
tokens = re.split(self.__splitexp, input_file)
#remove empty tokens and \n
return filter(lambda x: len(x) > 0 and x != '\n', tokens)
#input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
# line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
# this is for older RTF
#line = re.sub(self.__par_exp, '\\par ', line)
#return filter(lambda x: len(x) > 0, \
#(self.__remove_line.sub('', x) for x in tokens))
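The new __utf_ud substitution targets the \upr/\ud pairs some writers emit: an ANSI group followed by a \*\ud destination carrying the real Unicode text; only the Unicode half is kept and wrapped in \uc0 so no fallback characters are skipped afterwards. A simplified illustration of that rewrite over plain braces (the pattern and sample are illustrative; the class's own __utf_ud expects the backslash-escaped brace form):

import re

# Keep only the Unicode (\ud) half of a \upr/\ud pair and mark it \uc0.
UPR_UD = re.compile(r"\{\\upr(\{.*?\})\{\\\*\\ud(\{.*?\})\}\}")

sample = r"{\upr{\author Fo?}{\*\ud{\author F\u246 ?}}}"
print(UPR_UD.sub(r"{\\uc0 \2}", sample))
# prints: {\uc0 {\author F\u246 ?}}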
def __compile_expressions(self):
SIMPLE_RPL = {
"\\\\": "\\backslash ",
@ -145,18 +157,25 @@ class Tokenize:
r'\\$': '\\par ',
}
self.__replace_spchar = MReplace(SIMPLE_RPL)
#add ;? in case of char following \u
self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}") #modify this
#self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
#manage upr/ud situations
self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" + \
r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
#add \n in split for whole file reading
#self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
#why keep backslash whereas \ is replaced before?
#remove \n from endline char
self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
#self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
#self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
#self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
#self.__par_exp = re.compile(r'\\$')
#self.__remove_line = re.compile(r'\n+')
#self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
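For a quick sanity check, the new __splitexp keeps escaped braces, newlines and control words (with at most one trailing delimiter space) as tokens and leaves everything else as text runs; the sample string below is illustrative:

import re

splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
tokens = splitexp.split(r"{\rtf1\ansi\uc1 Hello \u233 ?}")
print([t for t in tokens if t and t != '\n'])
# prints: ['{', '\\rtf1', '\\ansi', '\\uc1 ', 'Hello ', '\\u233 ', '?}']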
def tokenize(self):
"""Main class for handling other methods. Reads the file \
, uses method self.sub_reg to make basic substitutions,\
@ -170,9 +189,9 @@ class Tokenize:
#remove '' and \n in the process
tokens = self.__sub_reg_split(input_file)
#correct unicode
#tokens = map(self.__unicode_process, tokens)
tokens = map(self.__unicode_process, tokens)
#remove empty items created by removing \uc
#tokens = filter(lambda x: len(x) > 0, tokens)
tokens = filter(lambda x: len(x) > 0, tokens)
#write
write_obj = open(self.__write_to, 'wb')
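The driver is now a plain three-stage pipeline: regex split, per-token unicode correction, then a filter for the empty strings left by skipped fallback characters. A minimal sketch of that shape (names are illustrative, not the class's private methods):

def run_pipeline(text, split, fix_unicode):
    tokens = split(text)                          # regex split into tokens
    tokens = [fix_unicode(t) for t in tokens]     # per-token unicode handling
    return [t for t in tokens if t]               # drop '' left by skipped chars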
@ -241,4 +260,9 @@ class Tokenize:
neg_uni_char = int(match_obj.group(1)) * -1
# sys.stderr.write(str( neg_uni_char))
uni_char = neg_uni_char + 65536
return '&#x' + str('%X' % uni_char) + ';'''
'''def __from_ms_to_utf8(self,match_obj):
uni_char = int(match_obj.group(1))
if uni_char < 0:
uni_char += 65536
return '&#x' + str('%X' % uni_char) + ';'''