From 4953fa24ea3476a7b59a8b545103e5e84c44aa87 Mon Sep 17 00:00:00 2001 From: Sengian Date: Fri, 23 Sep 2011 22:44:34 +0200 Subject: [PATCH] Fix trailing spaces after unicode chars --- src/calibre/ebooks/rtf2xml/tokenize.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py index 10d3fbba6f..97cc074d4d 100755 --- a/src/calibre/ebooks/rtf2xml/tokenize.py +++ b/src/calibre/ebooks/rtf2xml/tokenize.py @@ -28,7 +28,7 @@ class Tokenize: self.__bug_handler = bug_handler self.__copy = copy self.__write_to = tempfile.mktemp() - # self.__out_file = out_file + # self.__write_to = out_file self.__compile_expressions() #variables self.__uc_char = 0 @@ -41,14 +41,11 @@ class Tokenize: def __remove_uc_chars(self, startchar, token): for i in xrange(startchar, len(token)): - #handle the case of an uc char with a terminating blank before ansi char - if token[i] == " " and self.__uc_char: - continue - elif self.__uc_char: + if self.__uc_char: self.__uc_char -= 1 else: return token[i:] - #if only " " and char to skip + #if only char to skip return '' def __unicode_process(self, token): @@ -90,7 +87,7 @@ class Tokenize: self.__reini_utf8_counters() #get value and handle negative case uni_char = int(match_obj.group(1)) - uni_len = len(match_obj.group(1)) + 2 + uni_len = len(match_obj.group(0)) if uni_char < 0: uni_char += 65536 uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace') @@ -199,7 +196,7 @@ class Tokenize: # import sys # def main(args=sys.argv): - # if len(args) < 1: + # if len(args) < 2: # print 'No file' # return # file = 'data_tokens.txt' @@ -211,3 +208,5 @@ class Tokenize: # if __name__ == '__main__': # sys.exit(main()) + +# calibre-debug -e src/calibre/ebooks/rtf2xml/tokenize.py \ No newline at end of file