mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
Global overhaul of rtf2xml: RTFfixes (5) -> minor corrections and regression fix
This commit is contained in:
parent
b9ed0c6b3d
commit
a9fd0ad4ba
@ -50,7 +50,7 @@ class RTFInput(InputFormatPlugin):
|
|||||||
parser = ParseRtf(
|
parser = ParseRtf(
|
||||||
in_file = stream,
|
in_file = stream,
|
||||||
out_file = ofile,
|
out_file = ofile,
|
||||||
deb_dir = 'I:\\Calibre\\rtfdebug',
|
deb_dir = 'D:\\calibre\\pierre\\debug\\rtfdebug',
|
||||||
# Convert symbol fonts to unicode equivalents. Default
|
# Convert symbol fonts to unicode equivalents. Default
|
||||||
# is 1
|
# is 1
|
||||||
convert_symbol = 1,
|
convert_symbol = 1,
|
||||||
|
@ -120,8 +120,6 @@ class ParseRtf:
|
|||||||
script tries to output to the directory where the script is executed.)
|
script tries to output to the directory where the script is executed.)
|
||||||
'deb_dir' --debug directory. If a debug_dir is provided, the script
|
'deb_dir' --debug directory. If a debug_dir is provided, the script
|
||||||
will copy each run through as a file to examine in the debug_dir
|
will copy each run through as a file to examine in the debug_dir
|
||||||
'perl_script'--use perl to make tokens. This runs just a bit faster.
|
|
||||||
(I will probably phase this out.)
|
|
||||||
'check_brackets' -- make sure the brackets match up after each run
|
'check_brackets' -- make sure the brackets match up after each run
|
||||||
through a file. Only for debugging.
|
through a file. Only for debugging.
|
||||||
Returns: Nothing
|
Returns: Nothing
|
||||||
@ -142,7 +140,7 @@ class ParseRtf:
|
|||||||
self.__convert_wingdings = convert_wingdings
|
self.__convert_wingdings = convert_wingdings
|
||||||
self.__convert_zapf = convert_zapf
|
self.__convert_zapf = convert_zapf
|
||||||
self.__run_level = run_level
|
self.__run_level = run_level
|
||||||
#self.__exit_level = 0
|
#self.__exit_level = 0 See what this means and if it is consistent
|
||||||
self.__indent = indent
|
self.__indent = indent
|
||||||
self.__replace_illegals = replace_illegals
|
self.__replace_illegals = replace_illegals
|
||||||
self.__form_lists = form_lists
|
self.__form_lists = form_lists
|
||||||
@ -184,19 +182,15 @@ class ParseRtf:
|
|||||||
A parsed file in XML, either to standard output or to a file,
|
A parsed file in XML, either to standard output or to a file,
|
||||||
depending on the value of 'output' when the instance was created.
|
depending on the value of 'output' when the instance was created.
|
||||||
"""
|
"""
|
||||||
self.__temp_file = self.__make_temp_file(self.__file)
|
|
||||||
#Check to see if the file is correct ascii first
|
#Check to see if the file is correct ascii first
|
||||||
check_encoding_obj = check_encoding.CheckEncoding(
|
check_encoding_obj = check_encoding.CheckEncoding(
|
||||||
bug_handler = RtfInvalidCodeException,
|
bug_handler = RtfInvalidCodeException,
|
||||||
)
|
)
|
||||||
if check_encoding_obj.check_encoding(self.__file):
|
if check_encoding_obj.check_encoding(self.__file):
|
||||||
try:
|
|
||||||
os.remove(self.__temp_file)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
sys.stderr.write('File "%s" does not appear to be ascii.\n' \
|
sys.stderr.write('File "%s" does not appear to be ascii.\n' \
|
||||||
% self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
|
% self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
|
||||||
raise InvalidRtfException
|
raise InvalidRtfException
|
||||||
|
self.__temp_file = self.__make_temp_file(self.__file)
|
||||||
# if the self.__deb_dir is true, then create a copy object,
|
# if the self.__deb_dir is true, then create a copy object,
|
||||||
# set the directory to write to, remove files, and copy
|
# set the directory to write to, remove files, and copy
|
||||||
# the new temporary file to this directory
|
# the new temporary file to this directory
|
||||||
@ -223,7 +217,6 @@ class ParseRtf:
|
|||||||
replace_illegals = self.__replace_illegals,
|
replace_illegals = self.__replace_illegals,
|
||||||
)
|
)
|
||||||
line_obj.fix_endings()
|
line_obj.fix_endings()
|
||||||
#return_value = line_obj.fix_endings() #calibre: no return in this function, why keep it?
|
|
||||||
#self.__return_code(return_value)
|
#self.__return_code(return_value)
|
||||||
tokenize_obj = tokenize.Tokenize(
|
tokenize_obj = tokenize.Tokenize(
|
||||||
bug_handler = RtfInvalidCodeException,
|
bug_handler = RtfInvalidCodeException,
|
||||||
@ -550,6 +543,7 @@ class ParseRtf:
|
|||||||
write_file="rtf_write_file"
|
write_file="rtf_write_file"
|
||||||
read_obj = file if hasattr(file, 'read') else open(file,'r')
|
read_obj = file if hasattr(file, 'read') else open(file,'r')
|
||||||
write_obj = open(write_file, 'wb')
|
write_obj = open(write_file, 'wb')
|
||||||
write_obj.write(read_obj.read())
|
for line in read_obj:
|
||||||
|
write_obj.write(line)
|
||||||
write_obj.close()
|
write_obj.close()
|
||||||
return write_file
|
return write_file
|
@ -14,10 +14,10 @@ class CheckEncoding:
|
|||||||
sys.stderr.write(str(msg) + '\n')
|
sys.stderr.write(str(msg) + '\n')
|
||||||
def check_encoding(self, path, encoding='us-ascii'):
|
def check_encoding(self, path, encoding='us-ascii'):
|
||||||
read_obj = open(path, 'r')
|
read_obj = open(path, 'r')
|
||||||
input_file = read_obj.read()
|
|
||||||
read_obj.close()
|
|
||||||
line_num = 0
|
line_num = 0
|
||||||
for line in input_file:
|
error_found = False
|
||||||
|
for line in read_obj:
|
||||||
line_num += 1
|
line_num += 1
|
||||||
try:
|
try:
|
||||||
line.decode(encoding)
|
line.decode(encoding)
|
||||||
@ -26,8 +26,9 @@ class CheckEncoding:
|
|||||||
self.__get_position_error(line, encoding, line_num)
|
self.__get_position_error(line, encoding, line_num)
|
||||||
else:
|
else:
|
||||||
sys.stderr.write('line: %d has bad encoding\n'%line_num)
|
sys.stderr.write('line: %d has bad encoding\n'%line_num)
|
||||||
return True
|
error_found = True
|
||||||
return False
|
read_obj.close()
|
||||||
|
return error_found
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
check_encoding_obj = CheckEncoding()
|
check_encoding_obj = CheckEncoding()
|
||||||
|
@ -23,6 +23,7 @@ class Copy:
|
|||||||
def __init__(self, bug_handler, file = None, deb_dir = None, ):
|
def __init__(self, bug_handler, file = None, deb_dir = None, ):
|
||||||
self.__file = file
|
self.__file = file
|
||||||
self.__bug_handler = bug_handler
|
self.__bug_handler = bug_handler
|
||||||
|
|
||||||
def set_dir(self, deb_dir):
|
def set_dir(self, deb_dir):
|
||||||
"""Set the temporary directory to write files to"""
|
"""Set the temporary directory to write files to"""
|
||||||
if deb_dir is None:
|
if deb_dir is None:
|
||||||
@ -33,19 +34,11 @@ class Copy:
|
|||||||
message = "%(deb_dir)s is not a directory" % vars()
|
message = "%(deb_dir)s is not a directory" % vars()
|
||||||
raise self.__bug_handler , message
|
raise self.__bug_handler , message
|
||||||
Copy.__dir = deb_dir
|
Copy.__dir = deb_dir
|
||||||
|
|
||||||
def remove_files(self ):
|
def remove_files(self ):
|
||||||
"""Remove files from directory"""
|
"""Remove files from directory"""
|
||||||
self.__remove_the_files(Copy.__dir)
|
self.__remove_the_files(Copy.__dir)
|
||||||
"""
|
|
||||||
list_of_files = os.listdir(Copy.__dir)
|
|
||||||
list_of_files = os.listdir(the_dir)
|
|
||||||
for file in list_of_files:
|
|
||||||
rem_file = os.path.join(Copy.__dir,file)
|
|
||||||
if os.path.isdir(rem_file):
|
|
||||||
self.remove_files(rem_file)
|
|
||||||
else:
|
|
||||||
os.remove(rem_file)
|
|
||||||
"""
|
|
||||||
def __remove_the_files(self, the_dir):
|
def __remove_the_files(self, the_dir):
|
||||||
"""Remove files from directory"""
|
"""Remove files from directory"""
|
||||||
list_of_files = os.listdir(the_dir)
|
list_of_files = os.listdir(the_dir)
|
||||||
@ -58,6 +51,7 @@ class Copy:
|
|||||||
os.remove(rem_file)
|
os.remove(rem_file)
|
||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def copy_file(self, file, new_file):
|
def copy_file(self, file, new_file):
|
||||||
"""
|
"""
|
||||||
Copy the file to a new name
|
Copy the file to a new name
|
||||||
|
@ -735,8 +735,94 @@ class ProcessTokens:
|
|||||||
pre, token, action = self.dict_token.get(token, (None, None, None))
|
pre, token, action = self.dict_token.get(token, (None, None, None))
|
||||||
if action:
|
if action:
|
||||||
return action(pre, token, num)
|
return action(pre, token, num)
|
||||||
# unused function
|
|
||||||
def initiate_token_actions(self):
|
def __check_brackets(self, in_file):
|
||||||
|
self.__check_brack_obj = check_brackets.CheckBrackets\
|
||||||
|
(file = in_file)
|
||||||
|
good_br = self.__check_brack_obj.check_brackets()[0]
|
||||||
|
if not good_br:
|
||||||
|
return 1
|
||||||
|
def process_tokens(self):
|
||||||
|
"""Main method for handling other methods. """
|
||||||
|
|
||||||
|
read_obj= open(self.__file, 'r')
|
||||||
|
write_obj = open(self.__write_to, 'wb')
|
||||||
|
|
||||||
|
'''first_token = 0
|
||||||
|
second_token = 0'''
|
||||||
|
line_count = 0
|
||||||
|
|
||||||
|
for line in read_obj:
|
||||||
|
token = line.replace("\n","")
|
||||||
|
#calibre: not normally necessary, fixed in tokenize
|
||||||
|
'''if not token:
|
||||||
|
continue'''
|
||||||
|
line_count += 1
|
||||||
|
#calibre not necessary, encoding checked before
|
||||||
|
"""try:
|
||||||
|
token.decode('us-ascii')
|
||||||
|
except UnicodeError, msg:
|
||||||
|
msg = str(msg)
|
||||||
|
msg += 'Invalid RTF: File not ascii encoded.\n'
|
||||||
|
raise self.__exception_handler, msg"""
|
||||||
|
#calibre: with tokenize, should be first and second line, why bother?
|
||||||
|
"""if not first_token:
|
||||||
|
if token != '\\{':
|
||||||
|
msg = 'Invalid RTF: document doesn\'t start with {\n'
|
||||||
|
raise self.__exception_handler, msg
|
||||||
|
first_token = 1
|
||||||
|
elif line_count == and not second_token:
|
||||||
|
if token[0:4] != '\\rtf':
|
||||||
|
msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
|
||||||
|
raise self.__exception_handler, msg
|
||||||
|
second_token = 1"""
|
||||||
|
if line_count == 1 and token != '\\{':
|
||||||
|
msg = 'Invalid RTF: document doesn\'t start with {\n'
|
||||||
|
raise self.__exception_handler, msg
|
||||||
|
elif line_count == 2 and token[0:4] != '\\rtf':
|
||||||
|
msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
|
||||||
|
raise self.__exception_handler, msg
|
||||||
|
|
||||||
|
##token = self.evaluate_token(token)
|
||||||
|
the_index = token.find('\\ ')
|
||||||
|
if token is not None and the_index > -1:
|
||||||
|
msg ='Invalid RTF: token "\\ " not valid.\n'
|
||||||
|
raise self.__exception_handler, msg
|
||||||
|
elif token[:1] == "\\":
|
||||||
|
line = self.process_cw(token)
|
||||||
|
if line is not None:
|
||||||
|
write_obj.write(line)
|
||||||
|
else:
|
||||||
|
fields = re.split(self.__utf_exp, token)
|
||||||
|
for field in fields:
|
||||||
|
if not field:
|
||||||
|
continue
|
||||||
|
if field[0:1] == '&':
|
||||||
|
write_obj.write('tx<ut<__________<%s\n' % field)
|
||||||
|
else:
|
||||||
|
write_obj.write('tx<nu<__________<%s\n' % field)
|
||||||
|
|
||||||
|
read_obj.close()
|
||||||
|
write_obj.close()
|
||||||
|
|
||||||
|
if not line_count:
|
||||||
|
msg ='Invalid RTF: file appears to be empty.\n'
|
||||||
|
raise self.__exception_handler, msg
|
||||||
|
|
||||||
|
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||||
|
if self.__copy:
|
||||||
|
copy_obj.copy_file(self.__write_to, "processed_tokens.data")
|
||||||
|
copy_obj.rename(self.__write_to, self.__file)
|
||||||
|
os.remove(self.__write_to)
|
||||||
|
|
||||||
|
bad_brackets = self.__check_brackets(self.__file)
|
||||||
|
if bad_brackets:
|
||||||
|
msg = 'Invalid RTF: document does not have matching brackets.\n'
|
||||||
|
raise self.__exception_handler, msg
|
||||||
|
else:
|
||||||
|
return self.__return_code
|
||||||
|
|
||||||
|
'''def initiate_token_actions(self):
|
||||||
self.action_for_token={
|
self.action_for_token={
|
||||||
'{' : self.ob_func,
|
'{' : self.ob_func,
|
||||||
'}' : self.cb_func,
|
'}' : self.cb_func,
|
||||||
@ -752,75 +838,4 @@ class ProcessTokens:
|
|||||||
line = action(token)
|
line = action(token)
|
||||||
return line
|
return line
|
||||||
else :
|
else :
|
||||||
return 'tx<nu<nu<nu<nu<%s\n' % token
|
return 'tx<nu<nu<nu<nu<%s\n' % token'''
|
||||||
def __check_brackets(self, in_file):
|
|
||||||
self.__check_brack_obj = check_brackets.CheckBrackets\
|
|
||||||
(file = in_file)
|
|
||||||
good_br = self.__check_brack_obj.check_brackets()[0]
|
|
||||||
if not good_br:
|
|
||||||
return 1
|
|
||||||
def process_tokens(self):
|
|
||||||
"""Main method for handling other methods. """
|
|
||||||
first_token = 0
|
|
||||||
second_token = 0
|
|
||||||
read_obj = open(self.__file, 'r')
|
|
||||||
write_obj = open(self.__write_to, 'w')
|
|
||||||
line_to_read = "dummy"
|
|
||||||
line_count = 0
|
|
||||||
while line_to_read:
|
|
||||||
line_to_read = read_obj.readline()
|
|
||||||
token = line_to_read
|
|
||||||
token = token.replace("\n","")
|
|
||||||
if not token:
|
|
||||||
continue
|
|
||||||
line_count += 1
|
|
||||||
try:
|
|
||||||
token.decode('us-ascii')
|
|
||||||
except UnicodeError, msg:
|
|
||||||
msg = str(msg)
|
|
||||||
msg += 'Invalid RTF: File not ascii encoded.\n'
|
|
||||||
raise self.__exception_handler, msg
|
|
||||||
if not first_token:
|
|
||||||
if token != '\\{':
|
|
||||||
msg = 'Invalid RTF: document doesn\'t start with {\n'
|
|
||||||
raise self.__exception_handler, msg
|
|
||||||
first_token = 1
|
|
||||||
elif first_token and not second_token:
|
|
||||||
if token[0:4] != '\\rtf':
|
|
||||||
msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
|
|
||||||
raise self.__exception_handler, msg
|
|
||||||
second_token = 1
|
|
||||||
##token = self.evaluate_token(token)
|
|
||||||
the_index = token.find('\\ ')
|
|
||||||
if token != None and the_index > -1:
|
|
||||||
msg ='Invalid RTF: token "\\ " not valid. \n'
|
|
||||||
raise self.__exception_handler, msg
|
|
||||||
elif token[0:1] == "\\":
|
|
||||||
line = self.process_cw(token)
|
|
||||||
if line != None:
|
|
||||||
write_obj.write(line)
|
|
||||||
else:
|
|
||||||
fields = re.split(self.__utf_exp, token)
|
|
||||||
for field in fields:
|
|
||||||
if not field:
|
|
||||||
continue
|
|
||||||
if field[0:1] == '&':
|
|
||||||
write_obj.write('tx<ut<__________<%s\n' % field)
|
|
||||||
else:
|
|
||||||
write_obj.write('tx<nu<__________<%s\n' % field)
|
|
||||||
read_obj.close()
|
|
||||||
write_obj.close()
|
|
||||||
if not line_count:
|
|
||||||
msg ='Invalid RTF: file appears to be empty. \n'
|
|
||||||
raise self.__exception_handler, msg
|
|
||||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
|
||||||
if self.__copy:
|
|
||||||
copy_obj.copy_file(self.__write_to, "processed_tokens.data")
|
|
||||||
copy_obj.rename(self.__write_to, self.__file)
|
|
||||||
os.remove(self.__write_to)
|
|
||||||
bad_brackets = self.__check_brackets(self.__file)
|
|
||||||
if bad_brackets:
|
|
||||||
msg = 'Invalid RTF: document does not have matching brackets.\n'
|
|
||||||
raise self.__exception_handler, msg
|
|
||||||
else:
|
|
||||||
return self.__return_code
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user