Fix trailing spaces after unicode chars

This commit is contained in:
Sengian 2011-09-23 22:44:34 +02:00
parent 79b448726a
commit 4953fa24ea

View File

@@ -28,7 +28,7 @@ class Tokenize:
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
self.__copy = copy self.__copy = copy
self.__write_to = tempfile.mktemp() self.__write_to = tempfile.mktemp()
# self.__out_file = out_file # self.__write_to = out_file
self.__compile_expressions() self.__compile_expressions()
#variables #variables
self.__uc_char = 0 self.__uc_char = 0
@@ -41,14 +41,11 @@ class Tokenize:
def __remove_uc_chars(self, startchar, token): def __remove_uc_chars(self, startchar, token):
for i in xrange(startchar, len(token)): for i in xrange(startchar, len(token)):
#handle the case of an uc char with a terminating blank before ansi char if self.__uc_char:
if token[i] == " " and self.__uc_char:
continue
elif self.__uc_char:
self.__uc_char -= 1 self.__uc_char -= 1
else: else:
return token[i:] return token[i:]
#if only " " and char to skip #if only char to skip
return '' return ''
def __unicode_process(self, token): def __unicode_process(self, token):
@@ -90,7 +87,7 @@ class Tokenize:
self.__reini_utf8_counters() self.__reini_utf8_counters()
#get value and handle negative case #get value and handle negative case
uni_char = int(match_obj.group(1)) uni_char = int(match_obj.group(1))
uni_len = len(match_obj.group(1)) + 2 uni_len = len(match_obj.group(0))
if uni_char < 0: if uni_char < 0:
uni_char += 65536 uni_char += 65536
uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace') uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
@@ -199,7 +196,7 @@ class Tokenize:
# import sys # import sys
# def main(args=sys.argv): # def main(args=sys.argv):
# if len(args) < 1: # if len(args) < 2:
# print 'No file' # print 'No file'
# return # return
# file = 'data_tokens.txt' # file = 'data_tokens.txt'
@@ -211,3 +208,5 @@ class Tokenize:
# if __name__ == '__main__': # if __name__ == '__main__':
# sys.exit(main()) # sys.exit(main())
# calibre-debug -e src/calibre/ebooks/rtf2xml/tokenize.py