Global overhaul of rtf2xml: RTF fixes (3) -> removal of preprocessing, first draft of tokenize finished, introduction of \upr/\ud handling for unicode
This commit is contained in:
parent ae8fcb1fd4
commit 7c70914ad3
@@ -26,7 +26,7 @@ class Tokenize:
             in_file,
             bug_handler,
             copy = None,
-            #run_level = 1,
+            run_level = 1,
             ):
         self.__file = in_file
         self.__bug_handler = bug_handler
@@ -37,17 +37,22 @@ class Tokenize:
         self.__uc_char = 0
         self.__uc_bin = False
         self.__uc_value = [1]

-    def __from_ms_to_utf8(self,match_obj):
-        uni_char = int(match_obj.group(1))
-        if uni_char < 0:
-            uni_char += 65536
-        return '&#x' + str('%X' % uni_char) + ';'
-
     def __reini_utf8_counters(self):
         self.__uc_char = 0
         self.__uc_bin = False

+    def __remove_uc_chars(self, startchar, token):
+        for i in xrange(startchar, len(token)):
+            if token[i] == " ":
+                continue
+            elif self.__uc_char:
+                self.__uc_char -= 1
+            else:
+                return token[i:]
+        #if only " " and char to skip
+        return ''
+
     def __unicode_process(self, token):
         #change scope in
         if token == '\{':
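
The new __remove_uc_chars helper implements the RTF \ucN convention: every \uN control word is followed by N fallback characters meant for non-Unicode readers, and a Unicode-aware tokenizer must silently discard them. A minimal standalone sketch of the same skipping logic, written in Python 3 for illustration (the diff itself is Python 2, hence xrange); the function name and the returned (tail, budget) pair are this sketch's own, not the module's API:

def remove_uc_chars(uc_char, startchar, token):
    # Drop up to uc_char fallback characters from token[startchar:].
    # Spaces are not counted against the skip budget, mirroring the helper.
    for i in range(startchar, len(token)):
        if token[i] == " ":
            continue                   # delimiter space, not a fallback char
        elif uc_char:
            uc_char -= 1               # consume one fallback character
        else:
            return token[i:], uc_char  # surviving tail, nothing left to skip
    # the token held only spaces and characters to skip
    return '', uc_char

print(remove_uc_chars(1, 0, "e suite"))   # -> ('suite', 0)
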
@@ -55,9 +60,9 @@ class Tokenize:
             #basic error handling
             self.__reini_utf8_counters()
             return token
-        #change scope out: evaluate dict and rebuild
+        #change scope out
         elif token == '\}':
-            #self.__uc_value.pop()
+            self.__uc_value.pop()
             self.__reini_utf8_counters()
             return token
         #add a uc control
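
Re-enabling self.__uc_value.pop() makes \ucN properly group-scoped: a value set inside an RTF group must stop applying at the closing brace. Presumably the matching '\{' branch appends a copy of the current value; a minimal sketch of that stack discipline (the function names here are illustrative only, not the module's API):

uc_value = [1]                     # RTF default is \uc1

def enter_group():                 # on '\{'
    uc_value.append(uc_value[-1])  # the inner group inherits the value

def leave_group():                 # on '\}'
    uc_value.pop()                 # restore the enclosing scope

enter_group()
uc_value[-1] = 0                   # a \uc0 seen inside the group
leave_group()
assert uc_value[-1] == 1           # the outer value is intact again
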
@@ -65,58 +70,65 @@ class Tokenize:
             self.__uc_value[-1] = int(token[3:])
             self.__reini_utf8_counters()
             return token
-        #handle uc skippable char
+        #bin data to slip
+        elif self.__uc_bin:
+            self.__uc_bin = False
+            return ''
+        #uc char to remove
         elif self.__uc_char:
-            #if token[:1] == "\" and token[:1] == "\"
-            pass
+            #handle \bin tag in case of uc char to skip
+            if token[:4] == '\bin':
+                self.__uc_char -=1
+                self.__uc_bin = True
+                return ''
+            elif token[:1] == "\\" :
+                self.__uc_char -=1
+                return ''
+            else:
+                return self.__remove_uc_chars(0, token)
         #go for real \u token
         match_obj = self.__utf_exp.match(token)
         if match_obj is not None:
+            self.__reini_utf8_counters()
             #get value and handle negative case
             uni_char = int(match_obj.group(1))
             uni_len = len(match_obj.group(1)) + 2
             if uni_char < 0:
                 uni_char += 65536
             uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
-            #if not uc0
-            if self.__uc_value[-1]:
-                self.__uc_char = self.__uc_value[-1]
+            self.__uc_char = self.__uc_value[-1]
             #there is only an unicode char
             if len(token)<= uni_len:
                 return uni_char
             #an unicode char and something else
             #must be after as it is splited on \
-            elif not self.__uc_value[-1]:
-                print('not only token uc0 token: ' + uni_char + token[uni_len:])
+            #necessary? maybe for \bin?
+            elif not self.__uc_char:
                 return uni_char + token[uni_len:]
             #if not uc0 and chars
             else:
-                for i in xrange(uni_len, len(token)):
-                    if token[i] == " ":
-                        continue
-                    elif self.__uc_char > 0:
-                        self.__uc_char -= 1
-                    else:
-                        return uni_char + token[i:]
-                #print('uc: ' + str(self.__uc_value) + 'uni: ' + str(uni_char) + 'token: ' + token)
+                return uni_char + self.__remove_uc_chars(uni_len, token)
         #default
         return token

     def __sub_reg_split(self,input_file):
         input_file = self.__replace_spchar.mreplace(input_file)
-        #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
-        # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
-        # this is for older RTF
-        #line = re.sub(self.__par_exp, '\\par ', line)
-        input_file = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", input_file)
+        input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
+        input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
+        #remove \n in bin data
+        input_file = self.__bin_exp.sub(lambda x: \
+            x.group().replace('\n', '') +'\n', input_file)
         #split
         tokens = re.split(self.__splitexp, input_file)
         #remove empty tokens and \n
         return filter(lambda x: len(x) > 0 and x != '\n', tokens)
+        #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
+        # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
+        # this is for older RTF
+        #line = re.sub(self.__par_exp, '\\par ', line)
         #return filter(lambda x: len(x) > 0, \
         #(self.__remove_line.sub('', x) for x in tokens))
-

     def __compile_expressions(self):
         SIMPLE_RPL = {
             "\\\\": "\\backslash ",
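
Three details of this hunk deserve a note. The argument of \uN is a signed 16-bit integer, so code points above 0x7FFF arrive negative and are recovered by adding 65536. The surviving character is then serialized through 'xmlcharrefreplace', which emits decimal XML character references (the removed __from_ms_to_utf8 emitted hexadecimal ones). Finally, the new __uc_bin flag covers fallbacks given as \binN blobs, which must be swallowed whole rather than counted character by character. A sketch of the numeric path in Python 3, with chr standing in for Python 2's unichr and a hypothetical function name:

def u_token_to_ref(value):
    # Turn the signed 16-bit argument of an RTF \uN control word
    # into an XML character reference, as the live code path does.
    if value < 0:
        value += 65536      # undo the signed 16-bit wrap-around
    return chr(value).encode('ascii', 'xmlcharrefreplace')

print(u_token_to_ref(233))      # b'&#233;'   (e-acute)
print(u_token_to_ref(-3913))    # b'&#61623;' (0xF0B7, private-use area)
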
@@ -145,18 +157,25 @@ class Tokenize:
             r'\\$': '\\par ',
             }
         self.__replace_spchar = MReplace(SIMPLE_RPL)
+        #add ;? in case of char following \u
         self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
-        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}") #modify this
-        #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
+        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
+        self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
+        #manage upr/ud situations
+        self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" + \
+            r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
         #add \n in split for whole file reading
-        #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
         #why keep backslash whereas \is replaced before?
+        #remove \n from endline char
         self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
+        #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
+        #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
+        #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
         #self.__par_exp = re.compile(r'\\$')
         #self.__remove_line = re.compile(r'\n+')
         #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
         ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")

     def tokenize(self):
         """Main class for handling other methods. Reads the file \
         , uses method self.sub_reg to make basic substitutions,\
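
Among the new expressions, __utf_ud targets \upr groups, which carry the same text twice: an ANSI-only rendition and, nested under {\*\ud {...}}, a Unicode one. The substitution applied in __sub_reg_split keeps only the Unicode branch and wraps it in \uc0 so that no fallback characters are expected afterwards. __bin_exp in turn matches \binN payloads so their embedded newlines can be stripped before the line-sensitive split. A toy check of the \bin rewrite and of the tightened __utf_exp (patterns copied from the diff, input synthetic):

import re

bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")

sample = "a \\bin4 01\n01 b"
# newlines inside the matched blob are removed, one is re-appended
print(repr(bin_exp.sub(lambda m: m.group().replace('\n', '') + '\n', sample)))
# -> 'a \\bin4 0101\n b'

m = utf_exp.match("\\u233 e")
print(m.group(1), repr(m.group()))   # -> 233 '\\u233 ' (delimiter space eaten)
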
||||||
@ -170,9 +189,9 @@ class Tokenize:
|
|||||||
#remove '' and \n in the process
|
#remove '' and \n in the process
|
||||||
tokens = self.__sub_reg_split(input_file)
|
tokens = self.__sub_reg_split(input_file)
|
||||||
#correct unicode
|
#correct unicode
|
||||||
#tokens = map(self.__unicode_process, tokens)
|
tokens = map(self.__unicode_process, tokens)
|
||||||
#remove empty items created by removing \uc
|
#remove empty items created by removing \uc
|
||||||
#tokens = filter(lambda x: len(x) > 0, tokens)
|
tokens = filter(lambda x: len(x) > 0, tokens)
|
||||||
|
|
||||||
#write
|
#write
|
||||||
write_obj = open(self.__write_to, 'wb')
|
write_obj = open(self.__write_to, 'wb')
|
||||||
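
The commit also switches the unicode stage on: tokenize() now maps every token through __unicode_process and then filters out the empty strings left behind by skipped fallback characters. A toy end-to-end illustration with a stand-in processor (the real method additionally tracks \uc scoping and \bin data):

import re

skip = 0                            # fallback chars still to discard
def process(token):                 # stand-in for __unicode_process
    global skip
    m = re.match(r"\\u(-?\d{3,6}) ?", token)
    if m:                           # \uN: emit a char ref, arm the skip
        skip = 1                    # as under \uc1
        return '&#%d;' % int(m.group(1))
    if skip:                        # swallow one fallback token
        skip -= 1
        return ''
    return token

tokens = ['\\u233 ', 'e', 'text']
tokens = filter(lambda x: len(x) > 0, map(process, tokens))
print(list(tokens))                 # -> ['&#233;', 'text']
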
@@ -241,4 +260,9 @@ class Tokenize:
         neg_uni_char = int(match_obj.group(1)) * -1
         # sys.stderr.write(str( neg_uni_char))
         uni_char = neg_uni_char + 65536
+        return '&#x' + str('%X' % uni_char) + ';'''
+    '''def __from_ms_to_utf8(self,match_obj):
+        uni_char = int(match_obj.group(1))
+        if uni_char < 0:
+            uni_char += 65536
         return '&#x' + str('%X' % uni_char) + ';'''