Mirror of https://github.com/kovidgoyal/calibre.git
Global overhaul of rtf2xml: RTFfixes (3) -> removal of preprocessing, first draft of tokenize finished, introduction of \upr/\ud handling for Unicode
This commit is contained in:
parent ae8fcb1fd4
commit 7c70914ad3
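
Background for the Unicode changes below (an editor's illustration, not part of the commit): RTF encodes non-ANSI text as \uN, where N is a signed 16-bit value, so negative arguments must be wrapped modulo 65536; a preceding \ucN declares how many fallback characters follow each \uN and must be skipped.

    def rtf_u_to_codepoint(n):
        # \uN carries a signed 16-bit value; negative means n + 65536
        return n + 65536 if n < 0 else n

    # \u8364 is the euro sign; \u-3913 is a symbol-font char at 0xF0B7
    assert rtf_u_to_codepoint(8364) == 0x20AC
    assert rtf_u_to_codepoint(-3913) == 0xF0B7
    # the tokenizer emits XML character references in this form:
    print('&#x' + '%X' % rtf_u_to_codepoint(8364) + ';')   # &#x20AC;
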
@@ -26,7 +26,7 @@ class Tokenize:
            in_file,
            bug_handler,
            copy = None,
            #run_level = 1,
            run_level = 1,
            ):
        self.__file = in_file
        self.__bug_handler = bug_handler
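
The previously commented-out run_level argument is now a real constructor parameter. A hypothetical call site (editor's sketch; the file name and the bug_handler value are placeholders, and the entry-point name is assumed from the module's convention):

    tokenizer = Tokenize(
        'sample.rtf',    # in_file: path of the RTF to tokenize (placeholder)
        Exception,       # bug_handler: exception class raised on bad input (placeholder)
        copy=True,       # keep the intermediate file for debugging
        run_level=1,     # error-handling strictness
    )
    tokenizer.tokenize()  # entry-point name assumed, not shown in this diff
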
@@ -38,16 +38,21 @@ class Tokenize:
        self.__uc_bin = False
        self.__uc_value = [1]

    def __from_ms_to_utf8(self,match_obj):
        uni_char = int(match_obj.group(1))
        if uni_char < 0:
            uni_char += 65536
        return '&#x' + str('%X' % uni_char) + ';'

    def __reini_utf8_counters(self):
        self.__uc_char = 0
        self.__uc_bin = False

    def __remove_uc_chars(self, startchar, token):
        for i in xrange(startchar, len(token)):
            if token[i] == " ":
                continue
            elif self.__uc_char:
                self.__uc_char -= 1
            else:
                return token[i:]
        #if only " " and char to skip
        return ''

    def __unicode_process(self, token):
        #change scope in
        if token == '\{':
@@ -55,9 +60,9 @@ class Tokenize:
            #basic error handling
            self.__reini_utf8_counters()
            return token
        #change scope out: evaluate dict and rebuild
        #change scope out
        elif token == '\}':
            #self.__uc_value.pop()
            self.__uc_value.pop()
            self.__reini_utf8_counters()
            return token
        #add a uc control
@@ -65,58 +70,65 @@ class Tokenize:
            self.__uc_value[-1] = int(token[3:])
            self.__reini_utf8_counters()
            return token
        #handle uc skippable char
        #bin data to skip
        elif self.__uc_bin:
            self.__uc_bin = False
            return ''
        #uc char to remove
        elif self.__uc_char:
            #if token[:1] == "\" and token[:1] == "\"
            pass
            #handle \bin tag in case of uc char to skip
            if token[:4] == '\bin':
                self.__uc_char -= 1
                self.__uc_bin = True
                return ''
            elif token[:1] == "\\":
                self.__uc_char -= 1
                return ''
            else:
                return self.__remove_uc_chars(0, token)
        #go for a real \u token
        match_obj = self.__utf_exp.match(token)
        if match_obj is not None:
            self.__reini_utf8_counters()
            #get the value and handle the negative case
            uni_char = int(match_obj.group(1))
            uni_len = len(match_obj.group(1)) + 2
            if uni_char < 0:
                uni_char += 65536
            uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
            #if not uc0
            if self.__uc_value[-1]:
                self.__uc_char = self.__uc_value[-1]
            #there is only a unicode char
            if len(token) <= uni_len:
                return uni_char
            #a unicode char and something else
            #must come after, as the input is split on \
            elif not self.__uc_value[-1]:
                print('not only token uc0 token: ' + uni_char + token[uni_len:])
            #necessary? maybe for \bin?
            elif not self.__uc_char:
                return uni_char + token[uni_len:]
            #if not uc0 and chars
            else:
                for i in xrange(uni_len, len(token)):
                    if token[i] == " ":
                        continue
                    elif self.__uc_char > 0:
                        self.__uc_char -= 1
                    else:
                        return uni_char + token[i:]
                #print('uc: ' + str(self.__uc_value) + 'uni: ' + str(uni_char) + 'token: ' + token)
                return uni_char + self.__remove_uc_chars(uni_len, token)
        #default
        return token

    def __sub_reg_split(self, input_file):
        input_file = self.__replace_spchar.mreplace(input_file)
        #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
        #line = re.sub(self.__neg_utf_exp, self.__neg_unicode_func, line)
        #this is for older RTF
        #line = re.sub(self.__par_exp, '\\par ', line)
        input_file = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", input_file)
        input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
        input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
        #remove \n in bin data
        input_file = self.__bin_exp.sub(lambda x: \
            x.group().replace('\n', '') + '\n', input_file)
        #split
        tokens = re.split(self.__splitexp, input_file)
        #remove empty tokens and \n
        return filter(lambda x: len(x) > 0 and x != '\n', tokens)
        #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
        #line = re.sub(self.__neg_utf_exp, self.__neg_unicode_func, line)
        #this is for older RTF
        #line = re.sub(self.__par_exp, '\\par ', line)
        #return filter(lambda x: len(x) > 0, \
        #    (self.__remove_line.sub('', x) for x in tokens))


    def __compile_expressions(self):
        SIMPLE_RPL = {
            "\\\\": "\\backslash ",
@@ -145,13 +157,20 @@ class Tokenize:
            r'\\$': '\\par ',
            }
        self.__replace_spchar = MReplace(SIMPLE_RPL)
        #add ;? in case of a char following \u
        self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}") #modify this
        #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
        self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
        #manage upr/ud situations
        self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" + \
            r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
        #add \n to the split expression for whole-file reading
        #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
        #why keep the backslash, when \ is replaced earlier?
        #remove \n from the end-of-line char class
        self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
        #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
        #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
        #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
        #self.__par_exp = re.compile(r'\\$')
        #self.__remove_line = re.compile(r'\n+')
        #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
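
To make the \upr/\ud intent concrete, here is a standalone re-creation of the rewrite (editor's example; the pattern is a stripped-down cousin of __utf_ud above, and the sample string is in the post-mreplace form where braces appear as \{ and \}):

    import re

    # keep only the \*\ud branch of an \upr group, wrapped in \uc0 so that
    # no fallback characters remain to be skipped
    upr_ud = re.compile(r"\\{\\upr\\{.*?\\}\\{\\\*\\ud(\\{.*?\\})\\}\\}")
    src = r"\{\upr\{E\}\{\*\ud\{\u8364 \}\}\}"
    print(upr_ud.sub(r"\\{\\uc0 \g<1>\\}", src))   # \{\uc0 \{\u8364 \}\}
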
@@ -170,9 +189,9 @@ class Tokenize:
        #remove '' and \n in the process
        tokens = self.__sub_reg_split(input_file)
        #correct unicode
        #tokens = map(self.__unicode_process, tokens)
        tokens = map(self.__unicode_process, tokens)
        #remove empty items created by removing \uc
        #tokens = filter(lambda x: len(x) > 0, tokens)
        tokens = filter(lambda x: len(x) > 0, tokens)

        #write
        write_obj = open(self.__write_to, 'wb')
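
The flow above is: regex-split the preprocessed file, map __unicode_process over the tokens, filter out the empties, then write. A self-contained approximation of the happy path (editor's sketch; real input would first pass through mreplace, and \uc handling is elided):

    import re

    splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
    utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")

    def unicode_token(tok):
        # stand-in for the \u branch of __unicode_process
        m = utf_exp.match(tok)
        if m is None:
            return tok
        n = int(m.group(1))
        if n < 0:
            n += 65536
        return '&#x%X;' % n

    raw = "\\{\\uc0 \\u8364 \\}"   # {\uc0 \u8364 } after mreplace
    tokens = [t for t in splitexp.split(raw) if t and t != '\n']
    print([unicode_token(t) for t in tokens])
    # ['\\{', '\\uc0 ', '&#x20AC;', '\\}']
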
@@ -242,3 +261,8 @@ class Tokenize:
        # sys.stderr.write(str( neg_uni_char))
        uni_char = neg_uni_char + 65536
        return '&#x' + str('%X' % uni_char) + ';'''
    '''def __from_ms_to_utf8(self,match_obj):
        uni_char = int(match_obj.group(1))
        if uni_char < 0:
            uni_char += 65536
        return '&#x' + str('%X' % uni_char) + ';'''
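
One behavioral detail of retiring __from_ms_to_utf8: it built hexadecimal character references by hand, while the encode(..., 'xmlcharrefreplace') path now used in __unicode_process produces decimal ones. Both are valid XML references; a quick check (chr stands in for Python 2's unichr):

    euro = 8364
    print('&#x' + '%X' % euro + ';')                        # &#x20AC;
    print(chr(euro).encode('ascii', 'xmlcharrefreplace'))   # b'&#8364;'
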