Correct handling of \ as \par for old RTF

Sengian 2011-01-15 13:21:13 +01:00
parent c8810ce522
commit 10c2e603e2
5 changed files with 36 additions and 14 deletions
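The change itself is small: in RTF produced by very old writers, a bare backslash at the end of a line is shorthand for \par (a paragraph break). This commit stops trying to handle that case through the tokenizer's literal string-replacement table and instead compiles one regex, \\\n+, applied to the whole file before splitting. A minimal sketch of that substitution (the sample string is made up; note the original Python 2 code passes '\n\\par \n', whose unknown escape \p Python 2's re left alone, while modern Python needs the backslash doubled):

    import re

    # \ followed by one or more newlines marks a paragraph break in old RTF
    par_exp = re.compile(r'\\\n+')

    old_rtf = 'First paragraph\\\nSecond paragraph'
    # Pad with newlines and a trailing space so '\par ' later splits out
    # as a token of its own (see the Tokenize hunks below).
    print(par_exp.sub('\n\\\\par \n', old_rtf))
    # First paragraph
    # \par
    # Second paragraph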

View File

@@ -312,7 +312,6 @@ class RTFInput(InputFormatPlugin):
         try:
             xml = self.generate_xml(stream.name)
         except RtfInvalidCodeException, e:
-            raise
             raise ValueError(_('This RTF file has a feature calibre does not '
                 'support. Convert it to HTML first and then try it.\n%s')%e)
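For context on this hunk: a bare raise inside an except block re-raises the active exception immediately, so the user-facing ValueError below it was unreachable until the stray raise was removed. A tiny demonstration with stand-in exception types:

    def before_fix():
        try:
            raise RuntimeError('parser failure')   # stands in for RtfInvalidCodeException
        except RuntimeError as e:
            raise                                  # original exception escapes here...
            raise ValueError('friendly: %s' % e)   # ...so this line is dead code

    try:
        before_fix()
    except Exception as e:
        print(type(e).__name__)   # prints RuntimeError, never ValueError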

View File

@@ -226,10 +226,6 @@ class ParseRtf:
         try:
             return_value = process_tokens_obj.process_tokens()
         except InvalidRtfException, msg:
-            try:
-                os.remove(self.__temp_file)
-            except OSError:
-                pass
             #Check to see if the file is correctly encoded
             encode_obj = default_encoding.DefaultEncoding(
                 in_file = self.__temp_file,
@@ -244,11 +240,16 @@ class ParseRtf:
             enc = encode_obj.get_codepage()
             if enc != 'mac_roman':
                 enc = 'cp' + enc
+            msg = 'Exception in token processing'
             if check_encoding_obj.check_encoding(self.__file, enc):
                 file_name = self.__file if isinstance(self.__file, str) \
                                 else self.__file.encode('utf-8')
                 msg = 'File %s does not appear to be correctly encoded.\n' % file_name
+            try:
+                os.remove(self.__temp_file)
+            except OSError:
+                pass
             raise InvalidRtfException, msg
         delete_info_obj = delete_info.DeleteInfo(
             in_file = self.__temp_file,
             copy = self.__copy,
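The net effect of the two ParseRtf hunks: on a token-processing failure, msg now has a default ('Exception in token processing') and is only refined when the encoding check flags a bad codepage, and the temp-file cleanup runs on every failure path just before the re-raise rather than only at the top of the handler. A sketch of the resulting pattern (the flag argument stands in for check_encoding_obj.check_encoding, and ValueError for InvalidRtfException):

    import os

    def fail_with_cleanup(temp_file, source_file, looks_misencoded):
        msg = 'Exception in token processing'   # default reason
        if looks_misencoded:                    # refined when detectable
            msg = 'File %s does not appear to be correctly encoded.\n' % source_file
        try:
            os.remove(temp_file)                # best-effort cleanup...
        except OSError:
            pass                                # ...never masks the real error
        raise ValueError(msg)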

View File

@@ -3,6 +3,7 @@
 # copyright 2002 Paul Henry Tremblay                                   #
 #                                                                      #
 #########################################################################
 '''
 Codepages as to RTF 1.9.1:
 437     United States IBM

View File

@@ -70,7 +70,7 @@ class ProcessTokens:
             ';'    : ('mc', ';', self.ms_sub_func),
             # this must be wrong
             '-'    : ('mc', '-', self.ms_sub_func),
-            'line' : ('mi', 'hardline-break', self.hardline_func), #calibre
+            'line' : ('mi', 'hardline-break', self.direct_conv_func), #calibre
             # misc => ml
             '*'    : ('ml', 'asterisk__', self.default_func),
             ':'    : ('ml', 'colon_____', self.default_func),
@@ -605,7 +605,7 @@ class ProcessTokens:
     def ms_sub_func(self, pre, token, num):
         return 'tx<mc<__________<%s\n' % token
-    def hardline_func(self, pre, token, num):
+    def direct_conv_func(self, pre, token, num):
         return 'mi<tg<empty_____<%s\n' % token
     def default_func(self, pre, token, num):
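ProcessTokens dispatches on a table mapping each RTF control word to a (category, canonical name, handler) triple; each handler formats one line of the intermediate token stream. The rename from hardline_func to direct_conv_func reflects that the handler is a generic "emit an empty tag" conversion rather than anything specific to line breaks. A trimmed-down illustration of the dispatch (the two entries are copied from the diff; the wrapper class is invented for the example):

    class MiniProcessTokens:
        def __init__(self):
            self.dispatch = {
                ';'    : ('mc', ';', self.ms_sub_func),
                'line' : ('mi', 'hardline-break', self.direct_conv_func),
            }
        def ms_sub_func(self, pre, token, num):
            return 'tx<mc<__________<%s\n' % token
        def direct_conv_func(self, pre, token, num):
            return 'mi<tg<empty_____<%s\n' % token
        def process(self, word):
            pre, token, func = self.dispatch[word]
            return func(pre, token, None)

    print(MiniProcessTokens().process('line'))   # mi<tg<empty_____<hardline-break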

View File

@@ -27,11 +27,13 @@ class Tokenize:
             bug_handler,
             copy = None,
             run_level = 1,
+            # out_file = None,
             ):
         self.__file = in_file
         self.__bug_handler = bug_handler
         self.__copy = copy
         self.__write_to = tempfile.mktemp()
+        # self.__out_file = out_file
         self.__compile_expressions()
         #variables
         self.__uc_char = 0
@@ -113,6 +115,8 @@ class Tokenize:
     def __sub_reg_split(self,input_file):
         input_file = self.__replace_spchar.mreplace(input_file)
+        # this is for older RTF
+        input_file = self.__par_exp.sub('\n\\par \n', input_file)
         input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
         input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
         #remove \n in bin data
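The padding in '\n\\par \n' matters because of what runs next: __splitexp cuts the stream on newlines and control words, so the substituted \par (with its trailing space) comes out as a token of its own. Roughly (splitexp copied from the hunk further down; the filtering is simplified for illustration):

    import re

    splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
    text = 'First paragraph\n\\par \nSecond paragraph'
    tokens = [t for t in splitexp.split(text) if t and t != '\n']
    print(tokens)   # ['First paragraph', '\\par ', 'Second paragraph']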
@@ -127,7 +131,7 @@ class Tokenize:
         # this is for older RTF
         #line = re.sub(self.__par_exp, '\\par ', line)
         #return filter(lambda x: len(x) > 0, \
             #(self.__remove_line.sub('', x) for x in tokens))

     def __compile_expressions(self):
         SIMPLE_RPL = {
@@ -153,8 +157,6 @@ class Tokenize:
             # put a backslash in front of to eliminate special cases and
             # make processing easier
             "}": "\\}",
-            # this is for older RTF
-            r'\\$': '\\par ',
             }
         self.__replace_spchar = MReplace(SIMPLE_RPL)
         #add ;? in case of char following \u
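Why the SIMPLE_RPL entry had to go: assuming MReplace performs literal multi-string replacement (every other key in the table, such as "}", is literal text), the key r'\\$' was the three characters backslash-backslash-dollar, not "a backslash at end of line"; the $ was never a regex anchor, so the rule could not fire on real input. Under that assumption:

    import re

    key = r'\\$'          # the deleted key: the three chars \ \ $
    sample = 'text\\\n'   # backslash at end of line, as in old RTF
    print(key in sample)                         # False -- a literal rule never matches this
    print(bool(re.search(r'\\\n+', sample)))     # True -- the new compiled regex does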
@@ -168,10 +170,12 @@ class Tokenize:
         #why keep backslash whereas \is replaced before?
         #remove \n from endline char
         self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
+        #this is for old RTF
+        self.__par_exp = re.compile(r'\\\n+')
+        # self.__par_exp = re.compile(r'\\$')
         #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
         #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
         #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
-        #self.__par_exp = re.compile(r'\\$')
         #self.__remove_line = re.compile(r'\n+')
         #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
         ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
@@ -199,7 +203,24 @@ class Tokenize:
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "tokenize.data")
+        # if self.__out_file:
+        #     self.__file = self.__out_file
         copy_obj.rename(self.__write_to, self.__file)
         os.remove(self.__write_to)

 #self.__special_tokens = [ '_', '~', "'", '{', '}' ]
+# import sys
+# def main(args=sys.argv):
+#     if len(args) < 1:
+#         print 'No file'
+#         return
+#     file = 'data_tokens.txt'
+#     if len(args) == 3:
+#         file = args[2]
+#     to = Tokenize(args[1], Exception, out_file = file)
+#     to.tokenize()
+# if __name__ == '__main__':
+#     sys.exit(main())