Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Correct handling of \ as \par for old RTF

parent  c8810ce522
commit  10c2e603e2
@@ -312,7 +312,6 @@ class RTFInput(InputFormatPlugin):
         try:
             xml = self.generate_xml(stream.name)
         except RtfInvalidCodeException, e:
-            raise
             raise ValueError(_('This RTF file has a feature calibre does not '
                 'support. Convert it to HTML first and then try it.\n%s')%e)
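Note: the deleted bare raise re-raised RtfInvalidCodeException unconditionally, so the ValueError with the user-facing message below it was unreachable dead code; dropping it lets the conversion surface the actionable hint instead of the raw tokenizer error.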
@@ -226,10 +226,6 @@ class ParseRtf:
             try:
                 return_value = process_tokens_obj.process_tokens()
             except InvalidRtfException, msg:
-                try:
-                    os.remove(self.__temp_file)
-                except OSError:
-                    pass
                 #Check to see if the file is correctly encoded
                 encode_obj = default_encoding.DefaultEncoding(
                     in_file = self.__temp_file,
@@ -244,11 +240,16 @@ class ParseRtf:
                 enc = encode_obj.get_codepage()
                 if enc != 'mac_roman':
                     enc = 'cp' + enc
+                msg = 'Exception in token processing'
                 if check_encoding_obj.check_encoding(self.__file, enc):
                     file_name = self.__file if isinstance(self.__file, str) \
                                 else self.__file.encode('utf-8')
                     msg = 'File %s does not appear to be correctly encoded.\n' % file_name
-                raise InvalidRtfException, msg
+                try:
+                    os.remove(self.__temp_file)
+                except OSError:
+                    pass
+                raise InvalidRtfException, msg
             delete_info_obj = delete_info.DeleteInfo(
                 in_file = self.__temp_file,
                 copy = self.__copy,
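Taken together, the two ParseRtf hunks move the temp-file cleanup out of the top of the except handler and onto the raise path, and give msg a default so it is always bound even when the encoding check passes. A minimal, modernized sketch of the new error path (Python 3 syntax; cleanup_and_raise and its arguments are hypothetical names for illustration, not calibre API):

import os

class InvalidRtfException(Exception):
    pass

def cleanup_and_raise(temp_file, encoding_is_suspect, file_name):
    # Default message: 'msg' is now bound on every path.
    msg = 'Exception in token processing'
    if encoding_is_suspect:
        msg = 'File %s does not appear to be correctly encoded.\n' % file_name
    # Best-effort temp-file removal happens just before raising,
    # so the file is cleaned up no matter which message applies.
    try:
        os.remove(temp_file)
    except OSError:
        pass
    raise InvalidRtfException(msg)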
@@ -3,6 +3,7 @@
 # copyright 2002 Paul Henry Tremblay                                    #
 #                                                                       #
 #########################################################################
+
 '''
 Codepages as to RTF 1.9.1:
 437  United States IBM
@@ -70,7 +70,7 @@ class ProcessTokens:
         ';' : ('mc', ';', self.ms_sub_func),
         # this must be wrong
         '-' : ('mc', '-', self.ms_sub_func),
-        'line' : ('mi', 'hardline-break', self.hardline_func), #calibre
+        'line' : ('mi', 'hardline-break', self.direct_conv_func), #calibre
         # misc => ml
         '*' : ('ml', 'asterisk__', self.default_func),
         ':' : ('ml', 'colon_____', self.default_func),
@@ -605,7 +605,7 @@ class ProcessTokens:
     def ms_sub_func(self, pre, token, num):
         return 'tx<mc<__________<%s\n' % token

-    def hardline_func(self, pre, token, num):
+    def direct_conv_func(self, pre, token, num):
         return 'mi<tg<empty_____<%s\n' % token

     def default_func(self, pre, token, num):
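The ProcessTokens hunks are a pure rename: hardline_func becomes direct_conv_func with an unchanged body, presumably because "emit an empty intermediate tag for this control word" is a generic action rather than something specific to hard line breaks. For context, each control word dispatches through a table of (type, canonical-name, handler) tuples; a stripped-down sketch of that pattern (simplified names, not the actual calibre class):

class MiniDispatch:
    def direct_conv_func(self, pre, token, num):
        # Emit an intermediate 'empty tag' token for the control word.
        return 'mi<tg<empty_____<%s\n' % token

    def __init__(self):
        self.dispatch = {
            'line': ('mi', 'hardline-break', self.direct_conv_func),
        }

    def process(self, word):
        pre, name, func = self.dispatch[word]
        return func(pre, name, None)

print(MiniDispatch().process('line'))   # mi<tg<empty_____<hardline-break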
@@ -27,11 +27,13 @@ class Tokenize:
             bug_handler,
             copy = None,
             run_level = 1,
+            # out_file = None,
             ):
         self.__file = in_file
         self.__bug_handler = bug_handler
         self.__copy = copy
         self.__write_to = tempfile.mktemp()
+        # self.__out_file = out_file
         self.__compile_expressions()
         #variables
         self.__uc_char = 0
@@ -113,6 +115,8 @@ class Tokenize:

     def __sub_reg_split(self,input_file):
         input_file = self.__replace_spchar.mreplace(input_file)
+        # this is for older RTF
+        input_file = self.__par_exp.sub('\n\\par \n', input_file)
         input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
         input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
         #remove \n in bin data
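The effect of the added substitution can be checked in isolation. A minimal standalone sketch, assuming only the pattern and replacement shown in the diff (the replacement needs an extra level of backslash-escaping here because re.sub treats backslashes in the replacement string specially on modern Python):

import re

# A lone backslash at the end of a line is old RTF's paragraph mark.
par_exp = re.compile(r'\\\n+')

sample = 'first paragraph\\\nsecond paragraph'
print(par_exp.sub('\n\\\\par \n', sample))
# first paragraph
# \par 
# second paragraph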
@@ -127,7 +131,7 @@ class Tokenize:
         # this is for older RTF
         #line = re.sub(self.__par_exp, '\\par ', line)
         #return filter(lambda x: len(x) > 0, \
         #(self.__remove_line.sub('', x) for x in tokens))

     def __compile_expressions(self):
         SIMPLE_RPL = {
@@ -153,8 +157,6 @@ class Tokenize:
             # put a backslash in front of to eliminate special cases and
             # make processing easier
             "}": "\\}",
-            # this is for older RTF
-            r'\\$': '\\par ',
             }
         self.__replace_spchar = MReplace(SIMPLE_RPL)
         #add ;? in case of char following \u
@@ -168,10 +170,12 @@ class Tokenize:
         #why keep backslash whereas \is replaced before?
         #remove \n from endline char
         self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
+        #this is for old RTF
+        self.__par_exp = re.compile(r'\\\n+')
+        # self.__par_exp = re.compile(r'\\$')
         #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
         #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
         #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
-        #self.__par_exp = re.compile(r'\\$')
         #self.__remove_line = re.compile(r'\n+')
         #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
         ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
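This relocation is the heart of the fix. The deleted SIMPLE_RPL entry fed MReplace, which does literal (non-regex) replacement as used here, so the key r'\\$' could only ever match the literal character sequence backslash-backslash-dollar, never "backslash at end of line"; the older commented attempt re.compile(r'\\$') likewise anchors only at end of input, since it lacks re.MULTILINE. The new compiled pattern r'\\\n+' matches what old RTF actually emits: a backslash followed by the newline(s) that close its line. A quick standalone check:

import re

old_par = re.compile(r'\\$')     # without re.MULTILINE: end of string only
new_par = re.compile(r'\\\n+')   # '\' at the end of any line

text = 'one\\\ntwo\\\nthree'
print(old_par.findall(text))     # [] -- both paragraph marks missed
print(new_par.findall(text))     # ['\\\n', '\\\n']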
@@ -199,7 +203,24 @@ class Tokenize:
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "tokenize.data")
+        # if self.__out_file:
+        #     self.__file = self.__out_file
         copy_obj.rename(self.__write_to, self.__file)
         os.remove(self.__write_to)

         #self.__special_tokens = [ '_', '~', "'", '{', '}' ]
+
+# import sys
+# def main(args=sys.argv):
+#     if len(args) < 1:
+#         print 'No file'
+#         return
+#     file = 'data_tokens.txt'
+#     if len(args) == 3:
+#         file = args[2]
+#     to = Tokenize(args[1], Exception, out_file = file)
+#     to.tokenize()
+
+
+# if __name__ == '__main__':
+#     sys.exit(main())